From 21a6dc7c731bbef81f5dd727cab414d16fc02227 Mon Sep 17 00:00:00 2001
From: Richard Barnes
Date: Wed, 22 May 2024 12:49:06 -0700
Subject: [PATCH] c10::optional -> std::optional in deeplearning/fbgemm/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host.cpp +51 (#2623)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2623

Generated with
```
fbgs -f '.*\.(cpp|cxx|cc|h|hpp|cu|cuh)$' c10::optional -l | perl -pe 's/^fbsource.fbcode.//' | grep -v executorch | xargs -n 50 perl -pi -e 's/c10::optional/std::optional/g'
```

- If you approve of this diff, please use the "Accept & Ship" button :-)

(51 files modified.)

Reviewed By: palmje

Differential Revision: D57631089
---
 .../embedding_forward_quantized_host.cpp      |  44 +++----
 .../embedding_forward_quantized_host_cpu.cpp  |  38 +++---
 .../embedding_backward_dense_host.cpp         |   8 +-
 .../embedding_backward_dense_host_cpu.cpp     |   8 +-
 ...dding_backward_split_host_cpu_template.cpp |   8 +-
 ...embedding_backward_split_host_template.cpp |  28 ++---
 .../embedding_backward_split_template.cu      |   4 +-
 ...dding_split_host_pt2_autograd_template.cpp |  24 ++--
 .../codegen/utils/embedding_bounds_check.cu   |   4 +-
 .../utils/embedding_bounds_check_host.cpp     |   4 +-
 .../utils/embedding_bounds_check_host_cpu.cpp |   4 +-
 .../gen_ai/src/attention/gqa_attn_splitk.cu   |   2 +-
 .../gen_ai/src/quantize/cutlass_extensions.cu |  12 +-
 .../gen_ai/src/quantize/quantize.cpp          |  34 +++---
 .../gen_ai/src/quantize/quantize.cu           |  36 +++---
 fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h   |   2 +-
 .../fbgemm_gpu/embedding_inplace_update.h     |   8 +-
 fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h    | 108 +++++++++---------
 .../include/fbgemm_gpu/sparse_ops_utils.h     |  16 +--
 .../split_embeddings_cache_cuda.cuh           |  28 ++---
 .../fbgemm_gpu/split_embeddings_utils.cuh     |   6 +-
 .../embedding_inplace_update.cu               |   4 +-
 .../embedding_inplace_update_cpu.cpp          |   4 +-
 .../dense_to_jagged_forward.cu                |   2 +-
 .../jagged_tensor_ops_autograd.cpp            |  12 +-
 .../jagged_tensor_ops_cpu.cpp                 |  12 +-
 .../keyed_jagged_index_select_dim1.cu         |  12 +-
 fbgemm_gpu/src/memory_utils/memory_utils.cu   |   2 +-
 .../sparse_batched_unary_embeddings.cu        |   2 +-
 .../sparse_block_bucketize_features.cu        |  46 ++++----
 .../sparse_ops/sparse_bucketize_features.cu   |   4 +-
 fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp  | 106 ++++++++--------
 fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp  |   6 +-
 .../src/sparse_ops/sparse_permute_1d.cu       |   6 +-
 .../src/sparse_ops/sparse_permute_2d.cu       |  10 +-
 .../sparse_ops/sparse_permute_embeddings.cu   |   4 +-
 fbgemm_gpu/src/sparse_ops/sparse_range.cu     |   2 +-
 .../src/split_embeddings_cache/common.h       |  14 +--
 .../linearize_cache_indices.cpp               |   2 +-
 .../linearize_cache_indices.cu                |  12 +-
 .../split_embeddings_cache/lru_cache_find.cu  |   4 +-
 .../lru_cache_populate.cu                     |   4 +-
 .../lru_cache_populate_byte.cpp               |   4 +-
 .../lru_cache_populate_byte.cu                |   4 +-
 .../src/split_embeddings_cache/lxu_cache.cpp  |   8 +-
 .../src/split_embeddings_cache/lxu_cache.cu   |  10 +-
 .../reset_weight_momentum.cu                  |   4 +-
 .../transpose_embedding_input.cu              |   4 +-
 fbgemm_gpu/test/sparse/utils_test.cpp         |   2 +-
 49 files changed, 366 insertions(+), 366 deletions(-)

diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host.cpp
index dd808c592..1010da33d 100644
--- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host.cpp
+++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host.cpp
@@ -275,14 +275,14 @@ Tensor int_nbit_split_embedding_codegen_lookup_function(
 Tensor indices,
 Tensor
offsets, int64_t pooling_mode, - c10::optional indice_weights, + std::optional indice_weights, int64_t output_dtype, - c10::optional lxu_cache_weights, - c10::optional lxu_cache_locations, - c10::optional row_alignment, - c10::optional max_float8_D, - c10::optional fp8_exponent_bits, - c10::optional fp8_exponent_bias) { + std::optional lxu_cache_weights, + std::optional lxu_cache_locations, + std::optional row_alignment, + std::optional max_float8_D, + std::optional fp8_exponent_bits, + std::optional fp8_exponent_bias) { if (static_cast(pooling_mode) == PoolingMode::NONE) { std::vector max_D_list{ max_int2_D, @@ -390,29 +390,29 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, + std::optional indice_weights, int64_t output_dtype, - c10::optional lxu_cache_weights, - c10::optional lxu_cache_locations, - c10::optional row_alignment, - c10::optional max_float8_D, - c10::optional fp8_exponent_bits, - c10::optional fp8_exponent_bias, + std::optional lxu_cache_weights, + std::optional lxu_cache_locations, + std::optional row_alignment, + std::optional max_float8_D, + std::optional fp8_exponent_bits, + std::optional fp8_exponent_bias, // Additional args for UVM_CACHING. // cache_hash_size_cumsum: cumulative sum of # embedding rows of all the // tables. 1D tensor, dtype=int64. - c10::optional cache_hash_size_cumsum, + std::optional cache_hash_size_cumsum, // total_cache_hash_size: sum of # embedding rows of all the tables. - c10::optional total_cache_hash_size, + std::optional total_cache_hash_size, // cache_index_table_map: (linearized) index to table number map. // 1D tensor, dtype=int32. - c10::optional cache_index_table_map, + std::optional cache_index_table_map, // lxu_cache_state: Cache state (cached index, or invalid). // 2D tensor: # sets x assoc. dtype=int64. - c10::optional lxu_cache_state, + std::optional lxu_cache_state, // lxu_state: meta info for replacement (time stamp for LRU). // 2D tensor: # sets x assoc. dtype=int64. - c10::optional lxu_state) { + std::optional lxu_state) { // This function does prefetch() and foward() methods in // IntNBitTableBatchedEmbeddingBagsCodegen, but run them in sequence. // Prefetching of multiple batches of requests is not yet supported. 
@@ -435,7 +435,7 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( cache_hash_size_cumsum.value(), indices, offsets, - /*B_offsets=*/c10::optional(), + /*B_offsets=*/std::optional(), /*max_B=*/-1, /*indices_base_offset=*/0); @@ -506,8 +506,8 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( total_cache_hash_size.value(), gather_uvm_stats, uvm_cache_stats, - c10::optional(), // num_uniq_cache_indices - c10::optional() // lxu_cache_locations_output + std::optional(), // num_uniq_cache_indices + std::optional() // lxu_cache_locations_output ); #ifdef FBCODE_CAFFE2 diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp index 1bf00c5e2..c784b1b15 100644 --- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_host_cpu.cpp @@ -93,16 +93,16 @@ Tensor int_nbit_split_embedding_codegen_lookup_function_cpu( Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, + std::optional indice_weights, int64_t output_dtype, - c10::optional + std::optional lxu_cache_weights, // Not used, to match cache interface for CUDA op - c10::optional + std::optional lxu_cache_locations, // Not used, to match cache interface for CUDA op - c10::optional row_alignment, - c10::optional max_float8_D, - c10::optional fp8_exponent_bits, - c10::optional fp8_exponent_bias) { + std::optional row_alignment, + std::optional max_float8_D, + std::optional fp8_exponent_bits, + std::optional fp8_exponent_bias) { if (static_cast(pooling_mode) == PoolingMode::NONE) { std::vector max_D_list{ max_int2_D, @@ -179,20 +179,20 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu( Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, + std::optional indice_weights, int64_t output_dtype, - c10::optional lxu_cache_weights, - c10::optional lxu_cache_locations, - c10::optional row_alignment, - c10::optional max_float8_D, - c10::optional fp8_exponent_bits, - c10::optional fp8_exponent_bias, + std::optional lxu_cache_weights, + std::optional lxu_cache_locations, + std::optional row_alignment, + std::optional max_float8_D, + std::optional fp8_exponent_bits, + std::optional fp8_exponent_bias, // Additinal args for uvm_caching version. 
- c10::optional cache_hash_size_cumsum [[maybe_unused]], - c10::optional total_cache_hash_size [[maybe_unused]], - c10::optional cache_index_table_map [[maybe_unused]], - c10::optional lxu_cache_state [[maybe_unused]], - c10::optional lxu_state [[maybe_unused]]) { + std::optional cache_hash_size_cumsum [[maybe_unused]], + std::optional total_cache_hash_size [[maybe_unused]], + std::optional cache_index_table_map [[maybe_unused]], + std::optional lxu_cache_state [[maybe_unused]], + std::optional lxu_state [[maybe_unused]]) { LOG(WARNING) << "int_nbit_split_embedding_uvm_caching_codegen_lookup_function shouldn't be called for CPU; it is only for GPU."; return int_nbit_split_embedding_codegen_lookup_function_cpu( diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host.cpp index 74817fd63..db72a53d0 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host.cpp +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host.cpp @@ -99,8 +99,8 @@ class SplitLookupFunction_Dense_Op Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, - c10::optional feature_requires_grad) { + std::optional indice_weights, + std::optional feature_requires_grad) { ctx->save_for_backward({ dev_weights, weights_offsets, @@ -388,8 +388,8 @@ Tensor split_embedding_codegen_lookup_dense_function( Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, - c10::optional feature_requires_grad, + std::optional indice_weights, + std::optional feature_requires_grad, int64_t output_dtype = static_cast(SparseType::FP32)) { if (static_cast(pooling_mode) == PoolingMode::NONE) { return SplitNoBagLookupFunction_Dense_Op::apply( diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host_cpu.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host_cpu.cpp index f114ab203..ebda35205 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host_cpu.cpp +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_dense_host_cpu.cpp @@ -48,8 +48,8 @@ class SplitLookupFunction_Dense_Op Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, - c10::optional feature_requires_grad) { + std::optional indice_weights, + std::optional feature_requires_grad) { Tensor indice_weights_value = indice_weights.value_or(Tensor()); Tensor feature_requires_grad_value = feature_requires_grad.value_or(Tensor()); @@ -161,8 +161,8 @@ Tensor split_embedding_codegen_lookup_dense_function( Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, - c10::optional feature_requires_grad, + std::optional indice_weights, + std::optional feature_requires_grad, int64_t /* output_dtype = static_cast(SparseType::FP32) */) { return SplitLookupFunction_Dense_Op::apply( host_weights, diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_cpu_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_cpu_template.cpp index 18dfea921..2b5ec1465 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_cpu_template.cpp +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_cpu_template.cpp @@ -57,8 +57,8 @@ class SplitLookupFunction_{{ optimizer }}_Op : public torch::autograd::Function< Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, - c10::optional 
feature_requires_grad, + std::optional indice_weights, + std::optional feature_requires_grad, bool gradient_clipping, double max_gradient, bool stochastic_rounding, @@ -208,8 +208,8 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( Tensor indices, Tensor offsets, int64_t pooling_mode, - c10::optional indice_weights, - c10::optional feature_requires_grad, + std::optional indice_weights, + std::optional feature_requires_grad, bool gradient_clipping, double max_gradient, bool stochastic_rounding, diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp index 8fcc77cce..7ff1266dd 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp @@ -470,20 +470,20 @@ class {{ autograd_func }} : const Tensor& offsets, {%- if not nobag %} const int64_t pooling_mode, - const c10::optional& indice_weights, - const c10::optional& feature_requires_grad, + const std::optional& indice_weights, + const std::optional& feature_requires_grad, {%- endif %} const Tensor& lxu_cache_locations, - c10::optional uvm_cache_stats, + std::optional uvm_cache_stats, {%- if optimizer != "none" %} const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, {%- endif %} {%- if vbe %} - const c10::optional& B_offsets, - const c10::optional& vbe_output_offsets_feature_rank, - const c10::optional& vbe_B_offsets_rank_per_feature, + const std::optional& B_offsets, + const std::optional& vbe_output_offsets_feature_rank, + const std::optional& vbe_B_offsets_rank_per_feature, const c10::SymInt max_B, const c10::SymInt max_B_feature_rank, const c10::SymInt vbe_output_size, @@ -493,7 +493,7 @@ class {{ autograd_func }} : const bool use_homogeneous_placements, {%- if is_gwd %} {%- if "prev_iter_dev" not in args.split_function_arg_names %} - const c10::optional& prev_iter_dev, + const std::optional& prev_iter_dev, {%- endif %} {%- if "iter" not in args.split_function_arg_names %} const int64_t iter, @@ -790,8 +790,8 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function( const Tensor& indices, const Tensor& offsets, const int64_t pooling_mode, - const c10::optional& indice_weights, - const c10::optional& feature_requires_grad, + const std::optional& indice_weights, + const std::optional& feature_requires_grad, const Tensor& lxu_cache_locations, {%- if optimizer != "none" %} const bool gradient_clipping, @@ -800,18 +800,18 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function( {%- endif %} {{ args.split_function_args | join(", ") }}, const int64_t output_dtype = static_cast(SparseType::FP32), - const c10::optional& B_offsets = c10::nullopt, - const c10::optional& vbe_output_offsets_feature_rank = c10::nullopt, - const c10::optional& vbe_B_offsets_rank_per_feature = c10::nullopt, + const std::optional& B_offsets = c10::nullopt, + const std::optional& vbe_output_offsets_feature_rank = c10::nullopt, + const std::optional& vbe_B_offsets_rank_per_feature = c10::nullopt, const c10::SymInt max_B = -1, const c10::SymInt max_B_feature_rank = -1, const c10::SymInt vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false, - const c10::optional& uvm_cache_stats = c10::nullopt, + const std::optional& uvm_cache_stats = c10::nullopt, {%- if "prev_iter_dev" 
not in args.split_function_arg_names %} - const c10::optional& prev_iter_dev = c10::nullopt, + const std::optional& prev_iter_dev = c10::nullopt, {%- endif %} {%- if "iter" not in args.split_function_arg_names %} const int64_t iter = 0, diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu index a31bc8c39..77a0b4f4d 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu @@ -661,13 +661,13 @@ Tensor split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_e indices, {{ "offsets" if not is_index_select else "Tensor()" }}, {{ "true" if nobag else "false" }}, - {{ "c10::optional(vbe_b_t_map)" if vbe else "c10::optional()" }}, + {{ "std::optional(vbe_b_t_map)" if vbe else "std::optional()" }}, info_B_num_bits, info_B_mask, total_unique_indices, {%- if is_index_select %} true, // is_index_select - c10::optional(total_L_offsets), + std::optional(total_L_offsets), fixed_L_per_warp, num_warps_per_feature {%- else %} diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp index 690f408e8..4ab7fb313 100644 --- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp +++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp @@ -87,20 +87,20 @@ class {{ autograd_func }} : const Tensor& offsets, {%- if not nobag %} const int64_t pooling_mode, - const c10::optional& indice_weights, - const c10::optional& feature_requires_grad, + const std::optional& indice_weights, + const std::optional& feature_requires_grad, {%- endif %} const Tensor& lxu_cache_locations, - c10::optional uvm_cache_stats, + std::optional uvm_cache_stats, {%- if optimizer != "none" %} const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, {%- endif %} {%- if vbe %} - const c10::optional& B_offsets, - const c10::optional& vbe_output_offsets_feature_rank, - const c10::optional& vbe_B_offsets_rank_per_feature, + const std::optional& B_offsets, + const std::optional& vbe_output_offsets_feature_rank, + const std::optional& vbe_B_offsets_rank_per_feature, const c10::SymInt max_B, const c10::SymInt max_B_feature_rank, const c10::SymInt vbe_output_size, @@ -773,8 +773,8 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function_pt2( const Tensor& indices, const Tensor& offsets, const int64_t pooling_mode, - const c10::optional& indice_weights, - const c10::optional& feature_requires_grad, + const std::optional& indice_weights, + const std::optional& feature_requires_grad, const Tensor& lxu_cache_locations, {%- if optimizer != "none" %} const bool gradient_clipping, @@ -783,16 +783,16 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function_pt2( {%- endif %} {{ args_pt2.split_function_args | join(", ") }}, const int64_t output_dtype = static_cast(SparseType::FP32), - const c10::optional& B_offsets = c10::optional(), - const c10::optional& vbe_output_offsets_feature_rank = c10::optional(), - const c10::optional& vbe_B_offsets_rank_per_feature = c10::optional(), + const std::optional& B_offsets = std::optional(), + const std::optional& vbe_output_offsets_feature_rank = std::optional(), + const std::optional& vbe_B_offsets_rank_per_feature = std::optional(), const c10::SymInt max_B = -1, const 
c10::SymInt max_B_feature_rank = -1, const c10::SymInt vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false, - const c10::optional& uvm_cache_stats = c10::optional()) { + const std::optional& uvm_cache_stats = std::optional()) { {%- for vbe in ([True, False] if has_vbe_support else [False]) %} {%- if has_vbe_support %} {%- if vbe %} diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu b/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu index 0a710cc38..08e22baa9 100644 --- a/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check.cu @@ -184,8 +184,8 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - const c10::optional& weights, - const c10::optional& B_offsets, + const std::optional& weights, + const std::optional& B_offsets, const int64_t max_B) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( rows_per_table, indices, offsets, warning, weights, B_offsets); diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_host.cpp b/fbgemm_gpu/codegen/utils/embedding_bounds_check_host.cpp index 6913c59a2..776979f1a 100644 --- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_host.cpp @@ -25,8 +25,8 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode, Tensor& warning, - const c10::optional& weights, - const c10::optional& B_ofsets, + const std::optional& weights, + const std::optional& B_ofsets, const int64_t max_B); // Deprecated for fb namespace! Please use fbgemm namespace instead! diff --git a/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp index cc03d9ec2..cdf1db7cd 100644 --- a/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp +++ b/fbgemm_gpu/codegen/utils/embedding_bounds_check_host_cpu.cpp @@ -45,8 +45,8 @@ void bounds_check_indices_cpu( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - const c10::optional& weights, - const c10::optional& B_offsets, + const std::optional& weights, + const std::optional& B_offsets, const int64_t /*max_B*/) { TORCH_CHECK( !B_offsets.has_value(), diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu b/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu index 0f6fff96c..219833d9d 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu @@ -1836,7 +1836,7 @@ std::tuple gqa_attn_splitk_impl( const at::Tensor& seq_positions, // [B] const double qk_scale, const int64_t split_k, - const c10::optional& num_groups) { + const std::optional& num_groups) { at::OptionalDeviceGuard guard(XQ.device()); TORCH_CHECK(XQ.is_cuda()); TORCH_CHECK(cache_K.is_cuda()); diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu index dd76be16e..73512fb25 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu @@ -1014,7 +1014,7 @@ at::Tensor f8f8bf16_rowwise_impl( at::Tensor WQ, // FP8 at::Tensor x_scale, at::Tensor w_scale, - c10::optional bias) { + std::optional bias) { int M = XQ.size(0); int N = WQ.size(0); int K = XQ.size(1); @@ -1265,7 +1265,7 @@ at::Tensor 
dispatch_fp8_rowwise_kernel( at::Tensor WQ, at::Tensor x_scale, at::Tensor w_scale, - c10::optional bias) { + std::optional bias) { KernelMode kernel = get_kernel_mode(XQ, WQ); if (kernel == KernelMode::Small) { return f8f8bf16_rowwise_impl< @@ -1314,7 +1314,7 @@ at::Tensor f8f8bf16_rowwise( at::Tensor WQ, // FP8 at::Tensor x_scale, // FP32 at::Tensor w_scale, // FP32 - c10::optional bias = c10::nullopt, // BF16 + std::optional bias = c10::nullopt, // BF16 bool use_fast_accum = true) { // Check datatypes. TORCH_CHECK( @@ -1743,7 +1743,7 @@ at::Tensor f8f8bf16_cublas( at::Tensor Ainvs, at::Tensor Binvs, bool use_fast_accum = true, - c10::optional output = c10::nullopt) { + std::optional output = c10::nullopt) { auto m = A.size(0); auto n = B.size(0); auto k = A.size(1); @@ -1886,7 +1886,7 @@ at::Tensor f8f8bf16_cublas( at::Tensor Ainvs, at::Tensor Binvs, bool use_fast_accum, - c10::optional output) { + std::optional output) { throw std::runtime_error( "CUDA version is older than 12.0"); // requires CUDA>=12 } @@ -1920,7 +1920,7 @@ at::Tensor f8f8bf16_rowwise( at::Tensor WQ, // FP8 at::Tensor x_scale, at::Tensor w_scale, - c10::optional bias = c10::nullopt, + std::optional bias = c10::nullopt, bool use_fast_accum = true) { throw std::runtime_error( "CUDA version is older than 12.0"); // requires CUDA>=12 diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp index 91abab3a3..0aa78bcd5 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cpp @@ -48,7 +48,7 @@ at::Tensor f8f8bf16_rowwise( at::Tensor WQ, at::Tensor x_scale, at::Tensor w_scale, - c10::optional bias = c10::nullopt, + std::optional bias = c10::nullopt, bool use_fast_accum = true); at::Tensor f8f8bf16_cublas( at::Tensor A, @@ -56,7 +56,7 @@ at::Tensor f8f8bf16_cublas( at::Tensor Ainvs, at::Tensor Binvs, bool use_fast_accum, - c10::optional output); + std::optional output); at::Tensor f8i4bf16_rowwise( at::Tensor XQ, at::Tensor WQ, @@ -69,31 +69,31 @@ std::tuple per_tensor_dynamic_quantize_i8(at::Tensor X); std::vector quantize_fp8_per_tensor( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub); // scale upperbound + std::optional bs, // batch size + std::optional scale_ub); // scale upperbound std::vector quantize_fp8_per_row( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub, // scale upperbound - c10::optional output_dtype); // output dtype + std::optional bs, // batch size + std::optional scale_ub, // scale upperbound + std::optional output_dtype); // output dtype #if CUDART_VERSION >= 12000 std::vector quantize_fp8_per_col( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub); // scale upperbound + std::optional bs, // batch size + std::optional scale_ub); // scale upperbound #endif at::Tensor quantize_fp8_per_tensor_fixed_scale( at::Tensor input, at::Tensor scale, - c10::optional bs); + std::optional bs); at::Tensor get_fp8_per_tensor_scale( at::Tensor input, - c10::optional bs, - c10::optional scale_ub); // scale upperbound + std::optional bs, + std::optional scale_ub); // scale upperbound TORCH_LIBRARY_FRAGMENT(fbgemm, m) { #ifndef USE_ROCM @@ -133,7 +133,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // TODO: On AMD this throws "undefined symbol: // _ZN8facebook6gen_ai13llm_inference23quantize_fp8_per_tensorEN2at6TensorEN3c108optionalIS3_EE" // i.e. 
facebook::gen_ai::llm_inference::quantize_fp8_per_tensor(at::Tensor, - // c10::optional) when loading + // std::optional) when loading // quantize_ops with // torch.ops.load_library m.def( @@ -188,7 +188,7 @@ at::Tensor f8f8bf16_rowwise_meta( at::Tensor WQ, // FP8 at::Tensor x_scale, at::Tensor w_scale, - c10::optional bias = c10::nullopt, + std::optional bias = c10::nullopt, bool use_fast_accum = true) { int M = XQ.size(0); int N = WQ.size(0); @@ -198,8 +198,8 @@ at::Tensor f8f8bf16_rowwise_meta( std::vector quantize_fp8_per_tensor_meta( at::Tensor X, - c10::optional bs, - c10::optional scale_ub) { + std::optional bs, + std::optional scale_ub) { auto Y = at::empty_like(X, X.options().dtype(at::kFloat8_e4m3fn)); auto scale = at::empty({}, X.options().dtype(at::kBFloat16)); return {Y, scale}; @@ -211,7 +211,7 @@ at::Tensor f8f8bf16_cublas_meta( at::Tensor x_scale, at::Tensor w_scale, bool use_fast_accum = true, - c10::optional output = c10::nullopt) { + std::optional output = c10::nullopt) { const at::SymInt M = X.sym_size(0); const at::SymInt N = W.sym_size(0); auto Y = at::empty_symint({M, N}, X.options().dtype(at::kBFloat16)); diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu index b8846fd32..7846f59d7 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu @@ -607,8 +607,8 @@ void invokeComputeScale( at::Tensor get_fp8_per_tensor_scale( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub) // scale upper bound + std::optional bs, // batch size + std::optional scale_ub) // scale upper bound { CUDA_DEVICE_GUARD(input); TORCH_CHECK(input.numel() != 0, "input should not be empty tensor"); @@ -644,7 +644,7 @@ at::Tensor get_fp8_per_tensor_scale( at::Tensor quantize_fp8_per_tensor_fixed_scale( at::Tensor input, at::Tensor scale, - c10::optional bs) // batch size + std::optional bs) // batch size { CUDA_DEVICE_GUARD(input); TORCH_CHECK(input.numel() != 0, "input should not be empty tensor"); @@ -682,8 +682,8 @@ at::Tensor quantize_fp8_per_tensor_fixed_scale( // usecases/models when needed std::vector quantize_fp8_per_tensor( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub) // scale upperbound) + std::optional bs, // batch size + std::optional scale_ub) // scale upperbound) { CUDA_DEVICE_GUARD(input); TORCH_CHECK(input.numel() != 0, "input should not be empty tensor"); @@ -894,9 +894,9 @@ void invokeComputeScalesAndQuantizeMatrixCol( std::vector quantize_fp8_per_row( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub, // scale upperbound - c10::optional output_dtype) // Quantization type + std::optional bs, // batch size + std::optional scale_ub, // scale upperbound + std::optional output_dtype) // Quantization type { TORCH_CHECK(input.numel() != 0, "input should not be empty tensor"); TORCH_CHECK( @@ -969,8 +969,8 @@ std::vector quantize_fp8_per_row( std::vector quantize_fp8_per_col( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub) // scale upperbound) + std::optional bs, // batch size + std::optional scale_ub) // scale upperbound) { CUDA_DEVICE_GUARD(input); TORCH_CHECK(input.numel() != 0, "input should not be empty tensor"); @@ -1014,17 +1014,17 @@ std::vector quantize_fp8_per_col( #else std::vector quantize_fp8_per_tensor( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub) { // scale upperbound + 
std::optional bs, // batch size + std::optional scale_ub) { // scale upperbound throw std::runtime_error( "CUDA version is older than 12.0"); // requires CUDA>=12 } std::vector quantize_fp8_per_row( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub, // scale upperbound - c10::optional output_dtype) { // quantization type + std::optional bs, // batch size + std::optional scale_ub, // scale upperbound + std::optional output_dtype) { // quantization type throw std::runtime_error( "CUDA version is older than 12.0"); // requires CUDA>=12 } @@ -1032,15 +1032,15 @@ std::vector quantize_fp8_per_row( at::Tensor quantize_fp8_per_tensor_fixed_scale( at::Tensor input, at::Tensor scale, - c10::optional bs) { // batch size + std::optional bs) { // batch size throw std::runtime_error( "CUDA version is older than 12.0"); // requires CUDA>=12 } at::Tensor get_fp8_per_tensor_scale( at::Tensor input, - c10::optional bs, // batch size - c10::optional scale_ub) { // scale upperbound + std::optional bs, // batch size + std::optional scale_ub) { // scale upperbound throw std::runtime_error( "CUDA version is older than 12.0"); // requires CUDA>=12 } diff --git a/fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h b/fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h index e41f74619..7d7fcd0b8 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h +++ b/fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h @@ -153,7 +153,7 @@ void uvm_cuda_mem_advise(const Tensor& self, int64_t cuda_memory_advise); /// for more information on `cudaMemPrefetchAsync()`. void uvm_cuda_mem_prefetch_async( const Tensor& self, - c10::optional device_t); + std::optional device_t); /// @ingroup cumem-utils /// diff --git a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h index 4b97ed9f4..802ab9cb0 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h +++ b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h @@ -55,8 +55,8 @@ void embedding_inplace_update_cuda( Tensor update_row_idx, Tensor update_offsets, const int64_t row_alignment, - c10::optional lxu_cache_weights = c10::nullopt, - c10::optional lxu_cache_locations = c10::nullopt); + std::optional lxu_cache_weights = c10::nullopt, + std::optional lxu_cache_locations = c10::nullopt); void embedding_inplace_update_cpu( Tensor dev_weights, @@ -70,9 +70,9 @@ void embedding_inplace_update_cpu( Tensor update_row_idx, Tensor update_offsets, const int64_t row_alignment, - c10::optional lxu_cache_weights = + std::optional lxu_cache_weights = c10::nullopt, // Not used, to match cache interface for CUDA op - c10::optional lxu_cache_locations = + std::optional lxu_cache_locations = c10::nullopt // Not used, to match cache interface for CUDA op ); diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index 0029c6a96..d224f2273 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -90,21 +90,21 @@ at::Tensor segment_sum_csr_cpu( ///@see You can find more info here -std::tuple> +std::tuple> permute_2D_sparse_data_cuda( const at::Tensor& permute, const at::Tensor& lengths, const at::Tensor& indices, - const c10::optional& weights, - const c10::optional& permuted_lengths_sum); + const std::optional& weights, + const std::optional& permuted_lengths_sum); -std::tuple> +std::tuple> permute_1D_sparse_data_cuda( const at::Tensor& permute, const at::Tensor& lengths, const at::Tensor& indices, - const c10::optional& 
weights, - const c10::optional& permuted_lengths_sum); + const std::optional& weights, + const std::optional& permuted_lengths_sum); at::Tensor invert_permute_cuda(const at::Tensor& permute); #endif @@ -143,9 +143,9 @@ at::Tensor expand_into_jagged_permute_cpu( std::tuple< at::Tensor, at::Tensor, - c10::optional, - c10::optional, - c10::optional> + std::optional, + std::optional, + std::optional> ///@ingroup sparse-data-cuda block_bucketize_sparse_features_cuda( @@ -155,17 +155,17 @@ block_bucketize_sparse_features_cuda( const bool sequence, const at::Tensor& block_sizes, const int64_t my_size, - const c10::optional& weights, - const c10::optional& batch_size_per_feature, + const std::optional& weights, + const std::optional& batch_size_per_feature, const int64_t max_batch_size, - const c10::optional>& block_bucketize_pos); + const std::optional>& block_bucketize_pos); std::tuple< at::Tensor, at::Tensor, - c10::optional, - c10::optional, - c10::optional> + std::optional, + std::optional, + std::optional> ///@ingroup sparse-data-cpu block_bucketize_sparse_features_cpu( @@ -175,18 +175,18 @@ block_bucketize_sparse_features_cpu( const bool sequence, const at::Tensor& block_sizes, const int64_t my_size, - const c10::optional& weights, - const c10::optional& batch_size_per_feature, + const std::optional& weights, + const std::optional& batch_size_per_feature, const int64_t max_batch_size, - const c10::optional>& block_bucketize_pos); + const std::optional>& block_bucketize_pos); std::tuple< at::Tensor, at::Tensor, - c10::optional, - c10::optional, - c10::optional, - c10::optional> + std::optional, + std::optional, + std::optional, + std::optional> ///@ingroup sparse-data-cuda block_bucketize_sparse_features_inference_cuda( const at::Tensor& lengths, @@ -195,10 +195,10 @@ block_bucketize_sparse_features_inference_cuda( const bool sequence, const at::Tensor& block_sizes, const int64_t my_size, - const c10::optional& weights, - const c10::optional& batch_size_per_feature, + const std::optional& weights, + const std::optional& batch_size_per_feature, const int64_t max_batch_size, - const c10::optional>& block_bucketize_pos, + const std::optional>& block_bucketize_pos, const bool return_bucket_mapping); ///@ingroup sparse-data-cuda @@ -210,10 +210,10 @@ at::Tensor populate_bucketized_permute_cuda( std::tuple< at::Tensor, at::Tensor, - c10::optional, - c10::optional, - c10::optional, - c10::optional> + std::optional, + std::optional, + std::optional, + std::optional> ///@ingroup sparse-data-cpu block_bucketize_sparse_features_inference_cpu( @@ -223,10 +223,10 @@ block_bucketize_sparse_features_inference_cpu( const bool sequence, const at::Tensor& block_sizes, const int64_t my_size, - const c10::optional& weights, - const c10::optional& batch_size_per_feature, + const std::optional& weights, + const std::optional& batch_size_per_feature, const int64_t max_batch_size, - const c10::optional>& block_bucketize_pos, + const std::optional>& block_bucketize_pos, const bool return_bucket_mapping); ///@ingroup sparse-data-cpu @@ -238,8 +238,8 @@ at::Tensor populate_bucketized_permute_cpu( std::tuple< at::Tensor, at::Tensor, - c10::optional, - c10::optional> + std::optional, + std::optional> ///@ingroup sparse-data-cuda bucketize_sparse_features_cuda( @@ -247,38 +247,38 @@ bucketize_sparse_features_cuda( const at::Tensor& indices, const bool bucketize_pos, const int64_t my_size, - const c10::optional& weights); + const std::optional& weights); std::tuple< at::Tensor, at::Tensor, - c10::optional, - 
c10::optional> + std::optional, + std::optional> ///@ingroup sparse-data-cpu bucketize_sparse_features_cpu( const at::Tensor& lengths, const at::Tensor& indices, const bool bucketize_pos, const int64_t my_size, - const c10::optional& weights); + const std::optional& weights); ///@ingroup sparse-data-cpu -std::tuple> +std::tuple> permute_2D_sparse_data_cpu( const at::Tensor& permute, const at::Tensor& lengths, const at::Tensor& indices, - const c10::optional& weights, - const c10::optional& permuted_lengths_sum); + const std::optional& weights, + const std::optional& permuted_lengths_sum); ///@ingroup sparse-data-cpu -std::tuple> +std::tuple> permute_1D_sparse_data_cpu( const at::Tensor& permute, const at::Tensor& lengths, const at::Tensor& indices, - const c10::optional& weights, - const c10::optional& permuted_lengths_sum); + const std::optional& weights, + const std::optional& permuted_lengths_sum); at::Tensor _float_to_fused8bitrowwise_gpu(const at::Tensor& input); at::Tensor _float_to_paddedFP8rowwise_gpu( @@ -572,7 +572,7 @@ std::tuple> jagged_dense_elementwise_mul( std::tuple> dense_to_jagged( const at::Tensor& dense, const std::vector& offsets, - c10::optional total_L); + std::optional total_L); std::tuple> jagged_dense_elementwise_add_jagged_output( @@ -786,39 +786,39 @@ std::tuple embedding_bag_rowwise_prune( at::ScalarType compressed_indices_dtype, const bool abs, const int64_t min_non_pruned_rows, - const c10::optional& min_save_ratio); + const std::optional& min_save_ratio); ///@ingroup sparse-data-cpu at::Tensor lengths_range( const at::Tensor& t_in, - const c10::optional>& shape); + const std::optional>& shape); ///@ingroup sparse-data-cpu at::Tensor& lengths_range_out( at::Tensor& output, const at::Tensor& t_in, - const c10::optional>& shape); + const std::optional>& shape); ///@ingroup sparse-data-cuda at::Tensor lengths_range_cuda( const at::Tensor& t_in, - const c10::optional>& shape); -std::tuple> + const std::optional>& shape); +std::tuple> ///@ingroup sparse-data-cpu permute_sparse_features_cpu( const at::Tensor& permute, const at::Tensor& lengths, const at::Tensor& indices, - const c10::optional& weights); + const std::optional& weights); ///@ingroup sparse-data-cuda -std::tuple> +std::tuple> permute_sparse_features_cuda( const at::Tensor& permute, const at::Tensor& lengths, const at::Tensor& indices, - const c10::optional& weights); + const std::optional& weights); ///@ingroup sparse-data-cuda at::Tensor permute102_baddbmm_permute102_cuda( @@ -905,7 +905,7 @@ std::vector jagged_index_select_2d( const at::Tensor& values, const at::Tensor& lengths, const at::Tensor& indices, - const c10::optional num_dense_output_rows = c10::nullopt); + const std::optional num_dense_output_rows = c10::nullopt); at::Tensor jagged_index_select_2d_forward_cpu( const at::Tensor& values, diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h index 85a557623..6767b8659 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h @@ -17,7 +17,7 @@ inline bool torch_tensor_on_cpu_check(const at::Tensor& ten) { return ten.is_cpu(); } -inline bool torch_tensor_on_cpu_check(const c10::optional& ten) { +inline bool torch_tensor_on_cpu_check(const std::optional& ten) { return !ten.has_value() || torch_tensor_on_cpu_check(ten.value()); } @@ -27,7 +27,7 @@ inline std::optional get_device_index_from_tensor( } inline std::optional get_device_index_from_tensor( - const c10::optional& ten) { 
+ const std::optional& ten) { if (ten) { return {ten->device().index()}; } else { @@ -40,7 +40,7 @@ inline std::string torch_tensor_device_name(const at::Tensor& ten) { } inline std::string torch_tensor_device_name( - const c10::optional& ten) { + const std::optional& ten) { if (ten.has_value()) { return torch_tensor_device_name(ten.value()); } else { @@ -56,7 +56,7 @@ inline bool torch_tensor_on_same_device_check( inline bool torch_tensor_on_same_device_check( const at::Tensor& ten1, - const c10::optional& ten2) { + const std::optional& ten2) { return !ten2.has_value() || ten1.get_device() == ten2->get_device(); } @@ -64,7 +64,7 @@ inline bool torch_tensor_undefined(const at::Tensor& ten) { return ten.defined(); } -inline bool torch_tensor_undefined(const c10::optional& ten) { +inline bool torch_tensor_undefined(const std::optional& ten) { return !ten.has_value() || torch_tensor_undefined(ten.value()); } @@ -73,7 +73,7 @@ inline bool torch_tensor_on_cuda_gpu_check(const at::Tensor& ten) { } inline bool torch_tensor_on_cuda_gpu_check( - const c10::optional& ten) { + const std::optional& ten) { return !ten.has_value() || torch_tensor_on_cuda_gpu_check(ten.value()); } @@ -82,7 +82,7 @@ inline bool torch_tensor_empty_or_on_cuda_gpu_check(const at::Tensor& ten) { } inline bool torch_tensor_empty_or_on_cuda_gpu_check( - const c10::optional& ten) { + const std::optional& ten) { return !ten.has_value() || torch_tensor_empty_or_on_cuda_gpu_check(ten.value()); } @@ -92,7 +92,7 @@ inline bool torch_tensor_empty_or_on_cpu_check(const at::Tensor& ten) { } inline bool torch_tensor_empty_or_on_cpu_check( - const c10::optional& ten) { + const std::optional& ten) { return !ten.has_value() || torch_tensor_empty_or_on_cpu_check(ten.value()); } diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index d3d3d404a..e2e0cb381 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -28,7 +28,7 @@ enum uvm_cache_stats_index { ///@ingroup table-batched-embed-cuda /// Deduplicate indices. -std::tuple> +std::tuple> get_unique_indices_cuda( const at::Tensor& linear_indices, const int64_t max_indices, @@ -39,8 +39,8 @@ get_unique_indices_cuda( std::tuple< at::Tensor, at::Tensor, - c10::optional, - c10::optional> + std::optional, + std::optional> get_unique_indices_with_inverse_cuda( const at::Tensor& linear_indices, const int64_t max_indices, @@ -50,7 +50,7 @@ get_unique_indices_with_inverse_cuda( ///@ingroup table-batched-embed-cuda /// Lookup LRU cache to find uncached indices, and then sort them based on the /// set. 
-std::tuple> +std::tuple> lru_cache_find_uncached_cuda( at::Tensor unique_indices, at::Tensor unique_indices_length, @@ -74,7 +74,7 @@ at::Tensor linearize_cache_indices_cuda( const at::Tensor& cache_hash_size_cumsum, const at::Tensor& indices, const at::Tensor& offsets, - const c10::optional& B_offsets, + const std::optional& B_offsets, const int64_t max_B, const int64_t indices_base_offset); @@ -104,9 +104,9 @@ void lru_cache_populate_cuda( at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, - c10::optional uvm_cache_stats, + std::optional uvm_cache_stats, bool lock_cache_line, - c10::optional lxu_cache_locking_counter); + std::optional lxu_cache_locking_counter); ///@ingroup table-batched-embed-cuda /// LRU cache: fetch the rows corresponding to `linear_cache_indices` from @@ -127,7 +127,7 @@ void lru_cache_populate_byte_cuda( at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, - c10::optional uvm_cache_stats); + std::optional uvm_cache_stats); ///@ingroup table-batched-embed-cuda /// Direct-mapped (assoc=1) variant of lru_cache_populate_byte_cuda @@ -147,7 +147,7 @@ void direct_mapped_lru_cache_populate_byte_cuda( at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, - c10::optional uvm_cache_stats); + std::optional uvm_cache_stats); ///@ingroup table-batched-embed-cuda /// LFU cache: fetch the rows corresponding to `linear_cache_indices` from @@ -192,9 +192,9 @@ at::Tensor lxu_cache_lookup_cuda( at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, - c10::optional uvm_cache_stats, - c10::optional num_uniq_cache_indices, - c10::optional lxu_cache_locations_output); + std::optional uvm_cache_stats, + std::optional num_uniq_cache_indices, + std::optional lxu_cache_locations_output); at::Tensor emulate_cache_miss( at::Tensor lxu_cache_locations, @@ -211,7 +211,7 @@ at::Tensor direct_mapped_lxu_cache_lookup_cuda( at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, - c10::optional uvm_cache_stats); + std::optional uvm_cache_stats); //////@ingroup table-batched-embed-cuda /// Flush the cache: store the weights from the cache to the backing storage. 
@@ -261,4 +261,4 @@ void lxu_cache_locking_counter_decrement_cuda( void lxu_cache_locations_update_cuda( at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, - c10::optional num_uniq_cache_indices); + std::optional num_uniq_cache_indices); diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh index 45c08e102..42fe5eb4c 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh @@ -40,13 +40,13 @@ transpose_embedding_input( at::Tensor indices, at::Tensor offsets, bool nobag = false, - const c10::optional& vbe_b_t_map = c10::optional(), + const std::optional& vbe_b_t_map = std::optional(), const int64_t info_B_num_bits = 26, const int64_t info_B_mask = 0x2FFFFFF, const int64_t total_unique_indices = -1, const bool is_index_select = false, - const c10::optional& total_L_offsets = - c10::optional(), + const std::optional& total_L_offsets = + std::optional(), const int64_t fixed_L_per_warp = 0, const int64_t num_warps_per_feature = 0); diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu index 857b4eb9a..90af57d07 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu @@ -116,8 +116,8 @@ void embedding_inplace_update_cuda( Tensor update_row_idx, Tensor update_offsets, const int64_t row_alignment, - c10::optional lxu_cache_weights, - c10::optional lxu_cache_locations) { + std::optional lxu_cache_weights, + std::optional lxu_cache_locations) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( dev_weights, uvm_weights, diff --git a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp index 80ed4db60..c021d2de4 100644 --- a/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp @@ -78,8 +78,8 @@ void embedding_inplace_update_cpu( Tensor update_row_idx, Tensor update_offsets, const int64_t row_alignment, - c10::optional lxu_cache_weights, - c10::optional lxu_cache_locations) { + std::optional lxu_cache_weights, + std::optional lxu_cache_locations) { TENSOR_ON_CPU(dev_weights); TENSOR_ON_CPU(uvm_weights); TENSOR_ON_CPU(weights_placements); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu index c26710448..06f1e652c 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu @@ -15,7 +15,7 @@ namespace fbgemm_gpu { Tensor dense_to_jagged_forward( const Tensor& dense, const std::vector& offsets, - c10::optional total_L) { + std::optional total_L) { // D is the embedding dimension auto D = dense.size(-1); diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp index 2e08efb4d..4a816b937 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp @@ -264,7 +264,7 @@ class DenseToJaggedOp : public torch::autograd::Function { torch::autograd::AutogradContext* ctx, const Tensor& dense, const std::vector& offsets, - const c10::optional& total_L) { + const 
std::optional& total_L) { ctx->save_for_backward(offsets); // dims of dense tensor: @@ -284,7 +284,7 @@ class DenseToJaggedOp : public torch::autograd::Function { .typed& offsets, - c10::optional total_L)>(); + std::optional total_L)>(); auto output = op.call(dense, offsets, total_L); return {output}; @@ -604,7 +604,7 @@ class JaggedIndexSelect2dOp const Tensor& values, const Tensor& lengths, const Tensor& indices, - const c10::optional optional_num_dense_output_rows) { + const std::optional optional_num_dense_output_rows) { TORCH_CHECK( values.dim() == 2, "jagged_index_select supports only 2D inputs") TENSORS_ON_SAME_DEVICE(lengths, indices); @@ -625,7 +625,7 @@ class JaggedIndexSelect2dOp const Tensor& indices, const Tensor& input_offsets, const Tensor& output_offsets, - const c10::optional)>(); + const std::optional)>(); auto out = op.call( values, @@ -853,7 +853,7 @@ Tensor batched_dense_vec_jagged_2d_mul( std::tuple> dense_to_jagged( const Tensor& dense, const std::vector& offsets, - c10::optional total_L) { + std::optional total_L) { return {DenseToJaggedOp::apply(dense, offsets, total_L)[0], offsets}; } @@ -936,7 +936,7 @@ std::vector jagged_index_select_2d( const Tensor& values, const Tensor& lengths, const Tensor& indices, - const c10::optional num_dense_output_rows) { + const std::optional num_dense_output_rows) { return JaggedIndexSelect2dOp::apply( values, lengths, indices, num_dense_output_rows); } diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp index ecc646ac6..d6e0c99a5 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp @@ -454,7 +454,7 @@ at::Tensor jagged_to_padded_dense_backward( Tensor dense_to_jagged_forward( const Tensor& dense, const std::vector& offsets, - c10::optional total_L) { + std::optional total_L) { // D is the embedding dimension auto D = dense.size(-1); @@ -488,9 +488,9 @@ at::Tensor jagged_dense_dense_elementwise_add_jagged_output_forward( const at::Tensor& y_1) { // Convert to jagged auto jagged_values_0 = - dense_to_jagged_forward(y_0, x_offsets, c10::optional()); + dense_to_jagged_forward(y_0, x_offsets, std::optional()); auto jagged_values_1 = - dense_to_jagged_forward(y_1, x_offsets, c10::optional()); + dense_to_jagged_forward(y_1, x_offsets, std::optional()); // Add jagged_values + x_values -> sum_values auto sum_values = x_values + jagged_values_0 + jagged_values_1; @@ -635,7 +635,7 @@ Tensor jagged_dense_elementwise_mul_forward( const Tensor& y) { // Convert to jagged auto jagged_values = - dense_to_jagged_forward(y, x_offsets, c10::optional()); + dense_to_jagged_forward(y, x_offsets, std::optional()); // Multiply x_values * jagged_values -> prod_values auto prod_values = x_values * jagged_values; @@ -677,7 +677,7 @@ jagged_dense_elementwise_add_jagged_output_cpu( const Tensor& y) { // Convert to jagged auto jagged_values = - dense_to_jagged_forward(y, x_offsets, c10::optional()); + dense_to_jagged_forward(y, x_offsets, std::optional()); auto sum_values = x_values + jagged_values; @@ -1131,7 +1131,7 @@ Tensor jagged_index_select_2d_forward_v2_impl( const Tensor& indices, const Tensor& input_offsets, const Tensor& output_offsets, - const c10::optional optional_num_dense_output_rows) { + const std::optional optional_num_dense_output_rows) { // Intentionally not using optional::value_or here to avoid materializing // .item() call when possible. 
const auto num_dense_output_rows = optional_num_dense_output_rows.has_value() diff --git a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu index bafc11111..514a4f6df 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu @@ -174,8 +174,8 @@ class KeyedJaggedIndexSelectDim1GPUOp const Tensor& offsets, const Tensor& indices, const c10::SymInt _batch_size, - const c10::optional& weights, - const c10::optional& selected_lengths_sum) { + const std::optional& weights, + const std::optional& selected_lengths_sum) { at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(values.get_device()); @@ -355,8 +355,8 @@ class KeyedJaggedIndexSelectDim1GPUOp const Tensor& offsets, const Tensor& indices, // select same indices for all batches const c10::SymInt batch_size, - const c10::optional& weights, - const c10::optional& selected_lengths_sum) { + const std::optional& weights, + const std::optional& selected_lengths_sum) { at::AutoDispatchBelowADInplaceOrView guard; // TODO: Add weights support TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, offsets, values, indices); @@ -526,8 +526,8 @@ std::vector keyed_jagged_index_select_dim_1_gpu( const Tensor& offsets, const Tensor& indices, const c10::SymInt batch_size, - const c10::optional& weights, - const c10::optional selected_lengths_sum) { + const std::optional& weights, + const std::optional selected_lengths_sum) { return KeyedJaggedIndexSelectDim1GPUOp::apply( values, lengths, diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu index 46ad46949..6cd27c7b9 100644 --- a/fbgemm_gpu/src/memory_utils/memory_utils.cu +++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu @@ -360,7 +360,7 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) { void uvm_cuda_mem_prefetch_async( const Tensor& t, - c10::optional device_t) { + std::optional device_t) { // Call cudaMemPrefetchAsync on Tensor at::cuda::OptionalCUDAGuard device_guard; TORCH_CHECK(uvm_storage(t)); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu b/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu index 894852310..ba6fe2a22 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu @@ -211,7 +211,7 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda( indices, offsets, false, // nobag - c10::optional(), + std::optional(), info_B_num_bits, info_B_mask); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu index f1642111b..3d1b4a14c 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu @@ -323,10 +323,10 @@ __launch_bounds__(kMaxThreads) void _populate_bucketized_permute_cuda_kernel( std::tuple< Tensor, Tensor, - c10::optional, - c10::optional, - c10::optional, - c10::optional> + std::optional, + std::optional, + std::optional, + std::optional> _block_bucketize_sparse_features_cuda( const Tensor& lengths, const Tensor& indices, @@ -334,10 +334,10 @@ _block_bucketize_sparse_features_cuda( const bool sequence, const Tensor& block_sizes, const int64_t my_size, - const c10::optional& weights, - const c10::optional& batch_size_per_feature, + const std::optional& 
weights, + const std::optional& batch_size_per_feature, const int64_t max_B, - const c10::optional>& block_bucketize_pos, + const std::optional>& block_bucketize_pos, const bool return_bucket_mapping) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices); @@ -877,9 +877,9 @@ _block_bucketize_sparse_features_cuda( DLL_PUBLIC std::tuple< Tensor, Tensor, - c10::optional, - c10::optional, - c10::optional> + std::optional, + std::optional, + std::optional> block_bucketize_sparse_features_cuda( const Tensor& lengths, const Tensor& indices, @@ -887,15 +887,15 @@ block_bucketize_sparse_features_cuda( const bool sequence, const Tensor& block_sizes, const int64_t my_size, - const c10::optional& weights, - const c10::optional& batch_size_per_feature, + const std::optional& weights, + const std::optional& batch_size_per_feature, const int64_t max_B, - const c10::optional>& block_bucketize_pos) { + const std::optional>& block_bucketize_pos) { Tensor new_lengths; Tensor new_indices; - c10::optional new_weights; - c10::optional new_pos; - c10::optional unbucketize_permute; + std::optional new_weights; + std::optional new_pos; + std::optional unbucketize_permute; std::tie( new_lengths, new_indices, @@ -923,10 +923,10 @@ block_bucketize_sparse_features_cuda( DLL_PUBLIC std::tuple< Tensor, Tensor, - c10::optional, - c10::optional, - c10::optional, - c10::optional> + std::optional, + std::optional, + std::optional, + std::optional> block_bucketize_sparse_features_inference_cuda( const Tensor& lengths, const Tensor& indices, @@ -934,10 +934,10 @@ block_bucketize_sparse_features_inference_cuda( const bool sequence, const Tensor& block_sizes, const int64_t my_size, - const c10::optional& weights, - const c10::optional& batch_size_per_feature, + const std::optional& weights, + const std::optional& batch_size_per_feature, const int64_t max_B, - const c10::optional>& block_bucketize_pos, + const std::optional>& block_bucketize_pos, const bool return_bucket_mapping) { return _block_bucketize_sparse_features_cuda( lengths, diff --git a/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu index 0e8340387..9f90d76ca 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu @@ -84,13 +84,13 @@ __launch_bounds__(kMaxThreads) void _bucketize_sparse_features_cuda_kernel2( // This function partitions sparse features // cyclically along the sparse dimension into my_size blocks DLL_PUBLIC -std::tuple, c10::optional> +std::tuple, std::optional> bucketize_sparse_features_cuda( const Tensor& lengths, const Tensor& indices, const bool bucketize_pos, const int64_t my_size, - const c10::optional& weights) { + const std::optional& weights) { TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices); CUDA_DEVICE_GUARD(lengths); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index c39267a2c..01843a26c 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -278,18 +278,18 @@ template < void _block_bucketize_sparse_features_cpu_kernel( const Tensor& lengths, const Tensor& indices, - const c10::optional& weights, + const std::optional& weights, const bool bucketize_pos, const Tensor& block_sizes, const int64_t my_size, Tensor new_lengths, Tensor new_indices, - c10::optional new_weights, - c10::optional new_pos, - const c10::optional& unbucketize_permute, - const c10::optional& 
- const c10::optional>& block_bucketize_pos,
- const c10::optional& bucket_mapping) {
+ std::optional new_weights,
+ std::optional new_pos,
+ const std::optional& unbucketize_permute,
+ const std::optional& batch_size_per_feature,
+ const std::optional>& block_bucketize_pos,
+ const std::optional& bucket_mapping) {
 // allocate tensors and buffers
 const auto lengths_size = lengths.numel();
 const auto new_lengths_size = lengths_size * my_size;
@@ -503,13 +503,13 @@ template
 void _bucketize_sparse_features_cpu(
 const at::Tensor& lengths,
 const at::Tensor& indices,
- const c10::optional& weights,
+ const std::optional& weights,
 const bool bucketize_pos,
 const int64_t my_size,
 at::Tensor& new_lengths,
 at::Tensor& new_indices,
- c10::optional new_weights,
- c10::optional new_pos) {
+ std::optional new_weights,
+ std::optional new_pos) {
 TENSOR_ON_CPU(lengths);
 TENSOR_ON_CPU(indices);
 TENSOR_EMPTY_OR_ON_CPU(weights);
@@ -581,12 +581,12 @@ void _bucketize_sparse_features_cpu(
 }
 }
-std::tuple> permute_2D_sparse_data_cpu(
+std::tuple> permute_2D_sparse_data_cpu(
 const Tensor& permute,
 const Tensor& lengths,
 const Tensor& indices,
- const c10::optional& weights,
- const c10::optional& permuted_lengths_sum) {
+ const std::optional& weights,
+ const std::optional& permuted_lengths_sum) {
 TENSOR_ON_CPU(permute);
 TENSOR_ON_CPU(lengths);
 TENSOR_ON_CPU(indices);
@@ -605,7 +605,7 @@ std::tuple> permute_2D_sparse_data_cpu(
 Tensor permuted_lengths;
 Tensor permuted_indices;
- c10::optional permuted_weights;
+ std::optional permuted_weights;
 permuted_lengths = at::empty({T, B}, lengths.options());
@@ -753,12 +753,12 @@ void _permute_1D_indices_weights_kernel_cpu(
 }); // parallel_for T x B, different B across T
 }
-std::tuple> permute_1D_sparse_data_cpu(
+std::tuple> permute_1D_sparse_data_cpu(
 const Tensor& permute,
 const Tensor& lengths,
 const Tensor& indices,
- const c10::optional& weights,
- const c10::optional& permuted_lengths_sum) {
+ const std::optional& weights,
+ const std::optional& permuted_lengths_sum) {
 TENSOR_ON_CPU(permute);
 TENSOR_ON_CPU(lengths);
 TENSOR_ON_CPU(indices);
@@ -990,10 +990,10 @@ Tensor populate_bucketized_permute_cpu(
 std::tuple<
 Tensor,
 Tensor,
- c10::optional,
- c10::optional,
- c10::optional,
- c10::optional>
+ std::optional,
+ std::optional,
+ std::optional,
+ std::optional>
 _block_bucketize_sparse_features_cpu(
 const Tensor& lengths,
 const Tensor& indices,
@@ -1001,10 +1001,10 @@ _block_bucketize_sparse_features_cpu(
 const bool sequence,
 const Tensor& block_sizes,
 const int64_t my_size,
- const c10::optional& weights,
- const c10::optional& batch_size_per_feature,
+ const std::optional& weights,
+ const std::optional& batch_size_per_feature,
 const int64_t /* max_batch_size */, // Only used in GPU variant
- const c10::optional>& block_bucketize_pos,
+ const std::optional>& block_bucketize_pos,
 const bool return_bucket_mapping) {
 const auto lengths_size = lengths.numel();
 const auto new_lengths_size = lengths_size * my_size;
@@ -1141,9 +1141,9 @@ _block_bucketize_sparse_features_cpu(
 std::tuple<
 Tensor,
 Tensor,
- c10::optional,
- c10::optional,
- c10::optional>
+ std::optional,
+ std::optional,
+ std::optional>
 block_bucketize_sparse_features_cpu(
 const Tensor& lengths,
 const Tensor& indices,
@@ -1151,15 +1151,15 @@ block_bucketize_sparse_features_cpu(
 const bool sequence,
 const Tensor& block_sizes,
 const int64_t my_size,
- const c10::optional& weights,
- const c10::optional& batch_size_per_feature,
+ const std::optional& weights,
+ const std::optional& batch_size_per_feature,
 const int64_t /* max_batch_size */, // Only used in GPU variant
- const c10::optional>& block_bucketize_pos) {
+ const std::optional>& block_bucketize_pos) {
 Tensor new_lengths;
 Tensor new_indices;
- c10::optional new_weights;
- c10::optional new_pos;
- c10::optional unbucketize_permute;
+ std::optional new_weights;
+ std::optional new_pos;
+ std::optional unbucketize_permute;
 std::tie(
 new_lengths,
 new_indices,
@@ -1185,10 +1185,10 @@ block_bucketize_sparse_features_cpu(
 std::tuple<
 Tensor,
 Tensor,
- c10::optional,
- c10::optional,
- c10::optional,
- c10::optional>
+ std::optional,
+ std::optional,
+ std::optional,
+ std::optional>
 block_bucketize_sparse_features_inference_cpu(
 const Tensor& lengths,
 const Tensor& indices,
@@ -1196,10 +1196,10 @@ block_bucketize_sparse_features_inference_cpu(
 const bool sequence,
 const Tensor& block_sizes,
 const int64_t my_size,
- const c10::optional& weights,
- const c10::optional& batch_size_per_feature,
+ const std::optional& weights,
+ const std::optional& batch_size_per_feature,
 const int64_t /* max_batch_size */, // Only used in GPU variant
- const c10::optional>& block_bucketize_pos,
+ const std::optional>& block_bucketize_pos,
 const bool return_bucket_mapping) {
 return _block_bucketize_sparse_features_cpu(
 lengths,
@@ -1220,14 +1220,14 @@ block_bucketize_sparse_features_inference_cpu(
 std::tuple<
 at::Tensor,
 at::Tensor,
- c10::optional,
- c10::optional>
+ std::optional,
+ std::optional>
 bucketize_sparse_features_cpu(
 const at::Tensor& lengths,
 const at::Tensor& indices,
 const bool bucketize_pos,
 const int64_t my_size,
- const c10::optional& weights) {
+ const std::optional& weights) {
 TENSOR_ON_CPU(lengths);
 TENSOR_ON_CPU(indices);
 TENSOR_ON_CPU(weights);
@@ -2351,7 +2351,7 @@ std::tuple embedding_bag_rowwise_prune(
 at::ScalarType compressed_indices_dtype,
 const bool abs,
 const int64_t min_non_pruned_rows,
- const c10::optional& min_save_ratio) {
+ const std::optional& min_save_ratio) {
 TENSOR_ON_CPU(weights);
 TENSOR_ON_CPU(indicator);
 TENSOR_NDIM_EQUALS(weights, 2);
@@ -2403,7 +2403,7 @@ std::tuple embedding_bag_rowwise_prune(
 Tensor& lengths_range_out(
 Tensor& output,
 const Tensor& t_in,
- const c10::optional>& shape) {
+ const std::optional>& shape) {
 TENSOR_ON_CPU(t_in);
 TENSOR_NDIM_EQUALS(t_in, 1);
@@ -2449,7 +2449,7 @@ Tensor& lengths_range_out(
 Tensor lengths_range(
 const Tensor& t_in,
- const c10::optional>& shape) {
+ const std::optional>& shape) {
 auto output = at::empty({0}, t_in.options());
 return lengths_range_out(output, t_in, shape);
 }
@@ -2492,11 +2492,11 @@ void _permute_data_kernel_cpu(
 }); // parallel_for T * B
 }
-std::tuple> permute_sparse_features_cpu(
+std::tuple> permute_sparse_features_cpu(
 const Tensor& permute,
 const Tensor& lengths,
 const Tensor& indices,
- const c10::optional& weights) {
+ const std::optional& weights) {
 TENSOR_ON_CPU(permute);
 TENSOR_ON_CPU(lengths);
 TENSOR_ON_CPU(indices);
@@ -2755,8 +2755,8 @@ std::tuple permute_sequence_embeddings_cpu(
 Tensor permuted_lengths;
 Tensor permuted_embeddings;
- c10::optional weights_dummy;
- c10::optional permuted_lengths_sum_dummy;
+ std::optional weights_dummy;
+ std::optional permuted_lengths_sum_dummy;
 const auto T = permute.numel();
 const auto B = lengths.size(1);
@@ -2922,9 +2922,9 @@ namespace {
 Tensor index_select_dim0(
 const Tensor& input,
 const Tensor& indices,
- c10::optional /*consecutive_range_start*/,
- c10::optional /*consecutive_range_length*/,
- c10::optional /*skip_indices_sorting_fwd*/) {
+ std::optional /*consecutive_range_start*/,
+ std::optional /*consecutive_range_length*/,
+ std::optional /*skip_indices_sorting_fwd*/) {
 return at::index_select(input, 0, indices);
 }
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
index c998c6d5e..501853b84 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp
@@ -631,9 +631,9 @@ Tensor pack_segments_cuda(
 Tensor index_select_dim0_gpu(
 const Tensor& input,
 const Tensor& indices,
- c10::optional consecutive_range_start,
- c10::optional consecutive_range_length,
- c10::optional skip_indices_sorting_fwd) {
+ std::optional consecutive_range_start,
+ std::optional consecutive_range_length,
+ std::optional skip_indices_sorting_fwd) {
 bool user_skip_indices_sorting_fwd =
 skip_indices_sorting_fwd ? *skip_indices_sorting_fwd : false;
 return IndexSelectDim0GPUOp::apply(
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu
index 5ba0ed1d3..fbd54fd3a 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu
@@ -61,13 +61,13 @@ __global__ __launch_bounds__(kMaxThreads) void permute_1D_data_kernel(
 }
 }
-DLL_PUBLIC std::tuple>
+DLL_PUBLIC std::tuple>
 permute_1D_sparse_data_cuda(
 const Tensor& permute,
 const Tensor& lengths,
 const Tensor& indices,
- const c10::optional& weights,
- const c10::optional& permuted_lengths_sum) {
+ const std::optional& weights,
+ const std::optional& permuted_lengths_sum) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(permute, lengths, indices, weights);
 CUDA_DEVICE_GUARD(indices);
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu
index 2c56de9a0..ba777432c 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu
@@ -67,13 +67,13 @@ __global__ __launch_bounds__(kMaxThreads) void permute_2D_lengths_kernel(
 }
 }
-DLL_PUBLIC std::tuple>
+DLL_PUBLIC std::tuple>
 permute_2D_sparse_data_cuda(
 const Tensor& permute,
 const Tensor& lengths,
 const Tensor& indices,
- const c10::optional& weights,
- const c10::optional& permuted_lengths_sum) {
+ const std::optional& weights,
+ const std::optional& permuted_lengths_sum) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(permute, lengths, indices, weights);
 TORCH_CHECK(lengths.dim() == 2);
@@ -228,12 +228,12 @@ __global__ __launch_bounds__(kMaxThreads) void permute_indices_weights_kernel(
 }
 }
-DLL_PUBLIC std::tuple>
+DLL_PUBLIC std::tuple>
 permute_sparse_features_cuda(
 const Tensor& permute,
 const Tensor& lengths,
 const Tensor& indices,
- const c10::optional& weights) {
+ const std::optional& weights) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(permute, lengths, indices, weights);
 CUDA_DEVICE_GUARD(indices);
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_permute_embeddings.cu b/fbgemm_gpu/src/sparse_ops/sparse_permute_embeddings.cu
index 3664d02ce..8e8d56b62 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_permute_embeddings.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_permute_embeddings.cu
@@ -61,8 +61,8 @@ DLL_PUBLIC std::tuple permute_sequence_embeddings_cuda(
 Tensor permuted_lengths;
 Tensor permuted_embeddings;
- c10::optional weights_dummy;
- c10::optional permuted_lengths_sum_dummy;
+ std::optional weights_dummy;
+ std::optional permuted_lengths_sum_dummy;
 const auto T = permute.numel();
 const auto B = lengths.size(1);
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_range.cu b/fbgemm_gpu/src/sparse_ops/sparse_range.cu
index f747acb65..9e2d7a725 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_range.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_range.cu
@@ -108,7 +108,7 @@ offsets_range_cuda(const Tensor& offsets, int64_t range_size) {
 DLL_PUBLIC Tensor lengths_range_cuda(
 const Tensor& t_in,
- const c10::optional>& shape) {
+ const std::optional>& shape) {
 TENSOR_ON_CUDA_GPU(t_in);
 TENSOR_NDIM_EQUALS(t_in, 1);
diff --git a/fbgemm_gpu/src/split_embeddings_cache/common.h b/fbgemm_gpu/src/split_embeddings_cache/common.h
index ea8e48116..814fc05a7 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/common.h
+++ b/fbgemm_gpu/src/split_embeddings_cache/common.h
@@ -29,7 +29,7 @@ Tensor linearize_cache_indices_cpu(
 const Tensor& cache_hash_size_cumsum,
 const Tensor& indices,
 const Tensor& offsets,
- const c10::optional& B_offsets,
+ const std::optional& B_offsets,
 const int64_t max_B,
 const int64_t indices_base_offset);
@@ -53,7 +53,7 @@ void lru_cache_populate_byte_cpu(
 Tensor lru_state,
 int64_t row_alignment,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats);
+ std::optional uvm_cache_stats);
 void direct_mapped_lru_cache_populate_byte_cpu(
 Tensor weights,
@@ -71,7 +71,7 @@ void direct_mapped_lru_cache_populate_byte_cpu(
 Tensor lxu_cache_miss_timestamp,
 int64_t row_alignment,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats);
+ std::optional uvm_cache_stats);
 void lfu_cache_populate_byte_cpu(
 Tensor weights,
@@ -92,15 +92,15 @@ Tensor lxu_cache_lookup_cpu(
 Tensor lxu_cache_state,
 int64_t invalid_index,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats,
- c10::optional num_uniq_cache_indices,
- c10::optional lxu_cache_locations_output);
+ std::optional uvm_cache_stats,
+ std::optional num_uniq_cache_indices,
+ std::optional lxu_cache_locations_output);
 Tensor direct_mapped_lxu_cache_lookup_cpu(
 Tensor linear_cache_indices,
 Tensor lxu_cache_state,
 int64_t invalid_index,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats);
+ std::optional uvm_cache_stats);
 } // namespace fbgemm_gpu
diff --git a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp
index 34d7cafcc..2c174f3e2 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp
+++ b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp
@@ -16,7 +16,7 @@ DLL_PUBLIC Tensor linearize_cache_indices_cpu(
 const Tensor& /*cache_hash_size_cumsum*/,
 const Tensor& indices,
 const Tensor& /*offsets*/,
- const c10::optional& /*B_offsets*/,
+ const std::optional& /*B_offsets*/,
 const int64_t /*max_B*/,
 const int64_t /*indices_base_offset*/) {
 return at::empty_like(indices);
diff --git a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
index b391d4472..e2911f127 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
@@ -62,7 +62,7 @@ DLL_PUBLIC Tensor linearize_cache_indices_cuda(
 const Tensor& cache_hash_size_cumsum,
 const Tensor& indices,
 const Tensor& offsets,
- const c10::optional& B_offsets,
+ const std::optional& B_offsets,
 const int64_t max_B,
 const int64_t indices_base_offset) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
@@ -201,7 +201,7 @@ DLL_PUBLIC Tensor linearize_cache_indices_from_row_idx_cuda(
 }
 DLL_PUBLIC
-std::tuple, c10::optional>
+std::tuple, std::optional>
 get_unique_indices_cuda_impl(
 const Tensor& linear_indices,
 const int64_t max_indices,
@@ -217,8 +217,8 @@ get_unique_indices_cuda_impl(
 auto unique_indices = at::empty_like(linear_indices);
 auto unique_indices_length =
 at::empty({1}, linear_indices.options().dtype(at::kInt));
- c10::optional unique_indices_count = c10::nullopt;
- c10::optional linear_index_positions_sorted = c10::nullopt;
+ std::optional unique_indices_count = c10::nullopt;
+ std::optional linear_index_positions_sorted = c10::nullopt;
 Tensor linear_index_positions;
 if (compute_inverse_indices) {
@@ -329,7 +329,7 @@ get_unique_indices_cuda_impl(
 }
 DLL_PUBLIC
-std::tuple> get_unique_indices_cuda(
+std::tuple> get_unique_indices_cuda(
 const Tensor& linear_indices,
 const int64_t max_indices,
 const bool compute_count) {
@@ -343,7 +343,7 @@ std::tuple> get_unique_indices_cuda(
 }
 DLL_PUBLIC
-std::tuple, c10::optional>
+std::tuple, std::optional>
 get_unique_indices_with_inverse_cuda(
 const Tensor& linear_indices,
 const int64_t max_indices,
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
index f4448e81c..31934ae5b 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
@@ -153,7 +153,7 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel(
 } // namespace
-DLL_PUBLIC std::tuple>
+DLL_PUBLIC std::tuple>
 lru_cache_find_uncached_cuda(
 Tensor unique_indices,
 Tensor unique_indices_length,
@@ -186,7 +186,7 @@ lru_cache_find_uncached_cuda(
 auto cache_set_sorted_unique_indices = empty_like(unique_indices);
 Tensor cache_sets_positions;
- c10::optional cache_set_inverse_indices = c10::nullopt;
+ std::optional cache_set_inverse_indices = c10::nullopt;
 if (compute_inverse_indices) {
 TORCH_CHECK(
 cache_sets.numel() <=
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu
index 0cf8586f2..2bb8baf2e 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu
@@ -286,9 +286,9 @@ DLL_PUBLIC void lru_cache_populate_cuda(
 Tensor lru_state,
 const bool stochastic_rounding,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats,
+ std::optional uvm_cache_stats,
 bool lock_cache_line,
- c10::optional lxu_cache_locking_counter) {
+ std::optional lxu_cache_locking_counter) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
 weights,
 cache_hash_size_cumsum,
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp
index a7b2bf8b5..0663f4947 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp
+++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp
@@ -27,7 +27,7 @@ DLL_PUBLIC void lru_cache_populate_byte_cpu(
 Tensor lru_state,
 int64_t row_alignment,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats) {
+ std::optional uvm_cache_stats) {
 return;
 }
@@ -47,7 +47,7 @@ DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cpu(
 Tensor lxu_cache_miss_timestamp,
 int64_t row_alignment,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats) {
+ std::optional uvm_cache_stats) {
 return;
 }
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu
index e884aa489..e52af82bb 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu
@@ -519,7 +519,7 @@ DLL_PUBLIC void lru_cache_populate_byte_cuda(
 Tensor lru_state,
 int64_t row_alignment,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats) {
+ std::optional uvm_cache_stats) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
 weights,
 cache_hash_size_cumsum,
@@ -611,7 +611,7 @@ DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda(
 Tensor lxu_cache_miss_timestamp,
 int64_t row_alignment,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats) {
+ std::optional uvm_cache_stats) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
 weights,
 cache_hash_size_cumsum,
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp
index 296f96641..d5f6a28ce 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp
+++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp
@@ -17,9 +17,9 @@ DLL_PUBLIC Tensor lxu_cache_lookup_cpu(
 Tensor /* lxu_cache_state */,
 int64_t /* invalid_index */,
 bool /* gather_cache_stats */,
- c10::optional /* uvm_cache_stats */,
- c10::optional /* num_uniq_cache_indices */,
- c10::optional lxu_cache_locations_output) {
+ std::optional /* uvm_cache_stats */,
+ std::optional /* num_uniq_cache_indices */,
+ std::optional lxu_cache_locations_output) {
 return lxu_cache_locations_output.value_or(empty_like(
 linear_cache_indices, linear_cache_indices.options().dtype(at::kInt)));
 }
@@ -29,7 +29,7 @@ DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cpu(
 Tensor lxu_cache_state,
 int64_t invalid_index,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats) {
+ std::optional uvm_cache_stats) {
 return empty_like(
 linear_cache_indices, linear_cache_indices.options().dtype(at::kInt));
 }
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
index 266bf7ed6..8f7222e1d 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
@@ -410,9 +410,9 @@ DLL_PUBLIC Tensor lxu_cache_lookup_cuda(
 const Tensor lxu_cache_state,
 const int64_t invalid_index,
 const bool gather_cache_stats,
- const c10::optional uvm_cache_stats,
- const c10::optional num_uniq_cache_indices,
- const c10::optional lxu_cache_locations_output) {
+ const std::optional uvm_cache_stats,
+ const std::optional num_uniq_cache_indices,
+ const std::optional lxu_cache_locations_output) {
 const auto uniq_lookup = num_uniq_cache_indices.has_value();
 // TODO: Support gather_cache_stats=true when uniq_lookup=true
 TORCH_CHECK(
@@ -472,7 +472,7 @@ DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda(
 Tensor lxu_cache_state,
 int64_t invalid_index,
 bool gather_cache_stats,
- c10::optional uvm_cache_stats) {
+ std::optional uvm_cache_stats) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
 linear_cache_indices, lxu_cache_state);
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(uvm_cache_stats, lxu_cache_state);
@@ -543,7 +543,7 @@ __launch_bounds__(kMaxThreads) void lxu_cache_locations_update_kernel(
 DLL_PUBLIC void lxu_cache_locations_update_cuda(
 Tensor lxu_cache_locations,
 Tensor lxu_cache_locations_new,
- c10::optional num_uniq_cache_indices) {
+ std::optional num_uniq_cache_indices) {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
 lxu_cache_locations, lxu_cache_locations_new, num_uniq_cache_indices);
diff --git a/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu b/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu
index 2232b3240..1d502332d 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu
@@ -276,8 +276,8 @@ DLL_PUBLIC void reset_weight_momentum_cuda(
 total_cache_hash_size,
 false, // gather_cache_stats
 uvm_cache_stats,
- c10::optional(), // num_uniq_cache_indices
- c10::optional() // lxu_cache_locations_output
+ std::optional(), // num_uniq_cache_indices
+ std::optional() // lxu_cache_locations_output
 );
 }
diff --git a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
index eb5ad699a..ccebf2087 100644
--- a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
+++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu
@@ -208,12 +208,12 @@ transpose_embedding_input(
 Tensor indices,
 Tensor offsets,
 bool nobag,
- const c10::optional& vbe_b_t_map,
+ const std::optional& vbe_b_t_map,
 const int64_t info_B_num_bits,
 const int64_t info_B_mask,
 const int64_t total_unique_indices,
 const bool is_index_select,
- const c10::optional& total_L_offsets,
+ const std::optional& total_L_offsets,
 const int64_t fixed_L_per_warp,
 const int64_t num_warps_per_feature) {
 const bool vbe = vbe_b_t_map.has_value();
diff --git a/fbgemm_gpu/test/sparse/utils_test.cpp b/fbgemm_gpu/test/sparse/utils_test.cpp
index 942e26c63..7545d728c 100644
--- a/fbgemm_gpu/test/sparse/utils_test.cpp
+++ b/fbgemm_gpu/test/sparse/utils_test.cpp
@@ -57,7 +57,7 @@ TEST(SparseOpsUtilsTest, gpu_tensors_pass) {
 TEST(SparseOpsUtilsTest, optional_tensor_passes) {
 const auto ten1 = get_valid_cpu_tensor().cuda();
- const c10::optional ten2;
+ const std::optional ten2;
 const auto func = [&]() {
 TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(ten1, ten2);
 };