Skip to content

Commit

Permalink
Add cache conflict miss support (#2596)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #2596

Prior to this diff, SSD TBE lacked support for the conflict cache miss
scenario. It operated under the assumption that the cache, located in
GPU memory, was sufficiently large to hold all prefetched data from
SSD. In the event of a conflict cache miss, the behavior of SSD TBE
would be unpredictable (it could either fail or potentially access
illegal memory). Note that a conflict cache miss happens when an
embedding row is absent in the cache, and after being fetched from
SSD, it cannot be inserted into the cache due to capacity constraints
or associativity limitations.

This diff introduces support for conflict cache misses by storing rows
that cannot be inserted into the cache due to conflicts in a scratch
pad, which is a temporary GPU tensor. When rows miss the cache, TBE
kernels can access them in the scratch pad instead.

Prior to this diff, during the SSD prefetch stage, any row that
missed the cache and required fetching from SSD would first be fetched
into a CPU scratch pad and then transferred to GPU. Rows that could be
inserted into the cache would subsequently be copied from the GPU
scratch pad into the cache. If conflict misses occurred, the prefetch
behavior would be unpredictable. With this diff, conflict-missed rows
are now retained in the scratch pad, which is kept alive until the
current iteration completes. Throughout the forward and backward +
optimizer stages of TBE, the cache and the scratch pad are equivalent
in terms of usage. However, following the completion of the backward +
optimizer step, rows in the scratch pad are flushed back to SSD,
unlike rows residing in the cache, which are kept (not evicted) for
future use (see the diagram below for more details).

 {F1645878181}

Differential Revision: D55998215
  • Loading branch information
sryap authored and facebook-github-bot committed May 22, 2024
1 parent d7a5500 commit 3d84b25
Show file tree
Hide file tree
Showing 23 changed files with 1,157 additions and 490 deletions.
52 changes: 44 additions & 8 deletions fbgemm_gpu/FbgemmGpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ set(GWD_OPTIMIZERS
set(DEFUSED_OPTIMIZERS
rowwise_adagrad)

# Optimizers with SSD support. Each entry is iterated over later in this file
# to register the generated SSD-specific Python lookup module
# (lookup_<optimizer>_ssd.py) and the SSD backward host/kernel source files.
set(SSD_OPTIMIZERS
rowwise_adagrad)

set(WEIGHT_OPTIONS
weighted
unweighted_nobag
Expand Down Expand Up @@ -143,6 +147,7 @@ set(gen_gpu_kernel_source_files
"gen_embedding_forward_split_unweighted_codegen_cuda.cu"
"gen_embedding_backward_dense_indice_weights_codegen_cuda.cu"
"gen_embedding_backward_split_indice_weights_codegen_cuda.cu"
"gen_embedding_backward_ssd_indice_weights_codegen_cuda.cu"
"gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu"
"gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu"
"gen_batch_index_select_dim0_forward_codegen_cuda.cu"
Expand All @@ -153,10 +158,13 @@ set(gen_gpu_kernel_source_files
"gen_batch_index_select_dim0_backward_kernel_warp.cu"
"gen_embedding_backward_split_grad_embedding_ops.cu"
"gen_embedding_backward_split_grad_index_select.cu"
"gen_embedding_backward_common_split_device_kernel.cuh"
"gen_embedding_backward_batch_index_select_split_device_kernel.cuh"
"gen_embedding_backward_split_common_device_kernel.cuh"
"gen_embedding_backward_split_batch_index_select_device_kernel.cuh"
"gen_embedding_forward_split_weighted_gwd_codegen_cuda.cu"
"gen_embedding_forward_split_unweighted_gwd_codegen_cuda.cu"
"gen_embedding_forward_ssd_weighted_codegen_cuda.cu"
"gen_embedding_forward_ssd_unweighted_codegen_cuda.cu"
"gen_embedding_forward_ssd_unweighted_nobag_kernel_small.cu"
)

if(NOT USE_ROCM)
Expand All @@ -179,7 +187,8 @@ foreach(wdesc ${WEIGHT_OPTIONS})
"gen_embedding_backward_dense_split_${wdesc}_kernel_cta.cu"
"gen_embedding_backward_dense_split_${wdesc}_kernel_warp.cu"
"gen_embedding_forward_split_${wdesc}_kernel.cu"
"gen_embedding_backward_${wdesc}_split_device_kernel.cuh")
"gen_embedding_forward_ssd_${wdesc}_kernel.cu"
"gen_embedding_backward_split_${wdesc}_device_kernel.cuh")

foreach(etype fp32 fp16 fp8 int8 int4 int2)
list(APPEND gen_gpu_kernel_source_files
Expand All @@ -191,7 +200,7 @@ endforeach()
foreach(wdesc weighted unweighted)
list(APPEND gen_gpu_kernel_source_files
"gen_embedding_forward_split_${wdesc}_vbe_kernel.cu"
"gen_embedding_backward_${wdesc}_vbe_split_device_kernel.cuh")
"gen_embedding_backward_split_${wdesc}_vbe_device_kernel.cuh")
endforeach()

# Generate GWD files
Expand All @@ -207,22 +216,31 @@ set(gen_cpu_source_files

set(gen_python_source_files
${CMAKE_BINARY_DIR}/__init__.py
${CMAKE_BINARY_DIR}/lookup_args.py)
${CMAKE_BINARY_DIR}/lookup_args.py
${CMAKE_BINARY_DIR}/lookup_args_ssd.py
)

# For each of the optimizers, generate the backward split variant by adding
# the Python, CPU-only, GPU host, and GPU kernel source files

# Generate the Python functions only if there is the backend support
# Generate the Python functions only if there is the backend support (for all
# optimizers)
foreach(optimizer
${COMMON_OPTIMIZERS}
${CPU_ONLY_OPTIMIZERS}
${GPU_ONLY_OPTIMIZERS})
list(APPEND gen_python_source_files
"${CMAKE_BINARY_DIR}/lookup_${optimizer}.py")
list(APPEND gen_python_source_files
"${CMAKE_BINARY_DIR}/lookup_${optimizer}.py"
"${CMAKE_BINARY_DIR}/lookup_${optimizer}_pt2.py")
endforeach()

# SSD optimizers only: register one generated SSD lookup module
# (lookup_<optimizer>_ssd.py in the build directory) per entry of
# SSD_OPTIMIZERS, so Python functions are emitted only where backend support
# exists.
foreach(optimizer ${SSD_OPTIMIZERS})
  list(
    APPEND gen_python_source_files
    "${CMAKE_BINARY_DIR}/lookup_${optimizer}_ssd.py"
  )
endforeach()

# Generate the backend API for all optimizers to preserve the backward
# compatibility
list(APPEND gen_cpu_source_files
Expand Down Expand Up @@ -285,6 +303,24 @@ foreach(optimizer ${DEFUSED_OPTIMIZERS})
"${CMAKE_BINARY_DIR}/split_embedding_optimizer_${optimizer}.py")
endforeach()

# Register the generated SSD source files for every optimizer listed in
# SSD_OPTIMIZERS:
#   * gen_gpu_kernel_source_files <- the optimizer's SSD device kernel header
#   * gen_gpu_host_source_files   <- the SSD backward host source
#   * gen_gpu_kernel_source_files <- for each weight variant, the backward
#     `_cuda.cu`, `_kernel_cta.cu` and `_kernel_warp.cu` files
foreach(optimizer ${SSD_OPTIMIZERS})
  list(APPEND gen_gpu_kernel_source_files
    "gen_embedding_optimizer_${optimizer}_ssd_device_kernel.cuh")

  list(APPEND gen_gpu_host_source_files
    "gen_embedding_backward_ssd_${optimizer}.cpp")

  # The weight variants are spelled out here rather than taken from a shared
  # list variable.
  foreach(wdesc weighted unweighted unweighted_nobag)
    list(APPEND gen_gpu_kernel_source_files
      "gen_embedding_backward_${optimizer}_ssd_${wdesc}_cuda.cu"
      "gen_embedding_backward_${optimizer}_ssd_${wdesc}_kernel_cta.cu"
      "gen_embedding_backward_${optimizer}_ssd_${wdesc}_kernel_warp.cu")
  endforeach()
endforeach()

# Add optimizer_args.py (located in the build directory) to the list of
# defused-optimizer Python files.
list(APPEND gen_defused_optim_py_files
${CMAKE_BINARY_DIR}/optimizer_args.py)

Expand Down
Loading

0 comments on commit 3d84b25

Please sign in to comment.