Merge pull request #49 from ROCmSoftwarePlatform/IFU-2023-12-06
Ifu 2023 12 06
liligwu authored Dec 7, 2023
2 parents 06879b1 + 5a2c43e commit 61a7e50
Showing 100 changed files with 2,848 additions and 963 deletions.
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -347,6 +347,7 @@ build_fbgemm_gpu_package () {
       --package_name="${package_name}" \
       --python-tag="${python_tag}" \
       --plat-name="${plat_name}" \
+      --verbose \
       "${build_args[@]}"

   # Run checks on the built libraries
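Note: the new --verbose flag is passed through with the other wheel-build arguments assembled here; presumably it surfaces the full compiler invocations in the build logs. The consumer of the flag (the setup.py invocation this script drives) is not shown in this hunk, so that reading is an assumption.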
8 changes: 4 additions & 4 deletions .github/workflows/fbgemm_ci.yml
@@ -48,7 +48,7 @@ jobs:
           git config --global --add safe.directory '*'
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -86,7 +86,7 @@ jobs:

     steps:
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -127,7 +127,7 @@ jobs:
           git config --global --add safe.directory '*'
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -159,7 +159,7 @@ jobs:

     steps:
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
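Note: this and the following workflow files bump actions/checkout from v3 to v4; the step's inputs (e.g. submodules: true) are unchanged. Two jobs are deliberately kept on v3, with comments added in fbgemm_gpu_cuda_nightly.yml and fbgemm_gpu_pip.yml below, because their runner images ship a GLIBC too old for the newer action runtime.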
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_ci.yml
@@ -57,7 +57,7 @@ jobs:
           git config --global --add safe.directory '*'
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -126,7 +126,7 @@ jobs:
           git config --global --add safe.directory '*'
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -191,7 +191,7 @@ jobs:
           git config --global --add safe.directory '*'
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_cpu_nightly.yml
@@ -71,7 +71,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -136,7 +136,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_cpu_release.yml
@@ -68,7 +68,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -133,7 +133,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
3 changes: 2 additions & 1 deletion .github/workflows/fbgemm_gpu_cuda_nightly.yml
@@ -70,7 +70,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true

@@ -140,6 +140,7 @@ jobs:
     needs: build_artifact

     steps:
+      # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
       - name: Checkout the Repository
         uses: actions/checkout@v3
         with:
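Note: actions/checkout@v4 runs on Node 20, whose prebuilt binaries require glibc 2.28 or newer; the comment above pins this job to v3 because the instance's GLIBC predates that. The exact glibc version on the runner is not stated in the commit, so the 2.28 threshold is an inference from Node 20's published requirements.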
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_cuda_release.yml
@@ -74,7 +74,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_docs.yml
@@ -44,7 +44,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils rsync sudo tar wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           submodules: true
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_lint.yml
@@ -39,7 +39,7 @@ jobs:

     steps:
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Setup Miniconda
         run: . $PRELUDE; setup_miniconda $HOME/miniconda
5 changes: 3 additions & 2 deletions .github/workflows/fbgemm_gpu_pip.yml
@@ -66,7 +66,7 @@ jobs:
         run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Display System Info
         run: . $PRELUDE; print_system_info; print_ec2_info

@@ -116,6 +116,7 @@ jobs:
         cuda-version-publish: [ "11.8.0" ]

     steps:
+      # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
       - name: Checkout the Repository
         uses: actions/checkout@v3

@@ -182,7 +183,7 @@ jobs:
           git config --global --add safe.directory '*'
       - name: Checkout the Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Display System Info
         run: . $PRELUDE; print_system_info
4 changes: 4 additions & 0 deletions .gitignore
@@ -8,6 +8,10 @@
 # found in:
 # https://github.com/github/gitignore/

+# General
+.DS_Store
+*~
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
53 changes: 34 additions & 19 deletions fbgemm_gpu/CMakeLists.txt
@@ -432,10 +432,22 @@ else()
     DEPENDS "${optimizer_codegen_dependencies}")
 endif()

+set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp")
+if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
+  # NVCC in WSL complains about unknown -mavx options
+  # https://github.com/pytorch/FBGEMM/issues/2135
+  set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp")
+endif()
+
+set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp")
+if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
+  set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp")
+endif()
+
 if(CXX_AVX2_FOUND)
   set_source_files_properties(${gen_cpu_source_files}
     PROPERTIES COMPILE_OPTIONS
-    "-mavx2;-mf16c;-mfma;-fopenmp")
+    "${AVX2_FLAGS}")
 else()
   set_source_files_properties(${gen_cpu_source_files}
     PROPERTIES COMPILE_OPTIONS

@@ -504,13 +516,13 @@ set(fbgemm_sources_avx512
 if(CXX_AVX2_FOUND)
   set_source_files_properties(${fbgemm_sources_avx2}
     PROPERTIES COMPILE_OPTIONS
-    "-mavx2;-mf16c;-mfma")
+    "${AVX2_FLAGS}")
 endif()

 if(CXX_AVX512_FOUND)
   set_source_files_properties(${fbgemm_sources_avx512}
     PROPERTIES COMPILE_OPTIONS
-    "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")
+    "${AVX512_FLAGS}")
 endif()

 set(fbgemm_sources ${fbgemm_sources_normal})

@@ -561,19 +573,20 @@ set(fbgemm_gpu_sources_static_cpu
     codegen/embedding_forward_quantized_host_cpu.cpp
     codegen/embedding_backward_dense_host_cpu.cpp
     codegen/embedding_bounds_check_host_cpu.cpp
+    src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
     src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
     src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
     src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
     src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
-    src/input_combine_cpu.cpp
-    src/layout_transform_ops_cpu.cpp
+    src/input_combine_ops/input_combine_cpu.cpp
+    src/layout_transform_ops/layout_transform_ops_cpu.cpp
     src/quantize_ops/quantize_ops_cpu.cpp
     src/quantize_ops/quantize_ops_meta.cpp
     src/sparse_ops/sparse_ops_cpu.cpp
     src/sparse_ops/sparse_ops_meta.cpp
-    src/embedding_inplace_update_cpu.cpp
+    src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
     src/split_embeddings_cache/linearize_cache_indices.cpp
     src/split_embeddings_cache/lfu_cache_populate_byte.cpp
     src/split_embeddings_cache/lru_cache_populate_byte.cpp

@@ -588,16 +601,16 @@ if(NOT FBGEMM_CPU_ONLY)
     codegen/embedding_bounds_check_host.cpp
     src/memory_utils/memory_utils.cpp
     src/memory_utils/memory_utils_ops.cpp
-    src/layout_transform_ops_gpu.cpp
+    src/layout_transform_ops/layout_transform_ops_gpu.cpp
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
     src/quantize_ops/quantize_ops_gpu.cpp
     src/sparse_ops/sparse_ops_gpu.cpp
-    src/split_embeddings_utils.cpp
+    src/split_embeddings_utils/split_embeddings_utils.cpp
     src/split_embeddings_cache/split_embeddings_cache_ops.cu
-    src/metric_ops_host.cpp
-    src/embedding_inplace_update_gpu.cpp
-    src/input_combine_gpu.cpp
+    src/metric_ops/metric_ops_host.cpp
+    src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
+    src/input_combine_ops/input_combine_gpu.cpp
     codegen/batch_index_select_dim0_host.cpp)

   if(NVML_LIB_PATH)

@@ -607,8 +620,7 @@ if(NOT FBGEMM_CPU_ONLY)
   if(NVML_LIB_PATH OR USE_ROCM)
     message(STATUS "Adding merge_pooled_embeddings sources")
     list(APPEND fbgemm_gpu_sources_static_cpu
-      src/merge_pooled_embeddings_cpu.cpp
-      src/merge_pooled_embeddings_gpu.cpp
+      src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp
       src/topology_utils.cpp)
   else()
     message(STATUS "Skipping merge_pooled_embeddings sources")

@@ -618,7 +630,7 @@ endif()
 if(CXX_AVX2_FOUND)
   set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
     PROPERTIES COMPILE_OPTIONS
-    "-mavx;-mf16c;-mfma;-mavx2;-fopenmp")
+    "${AVX2_FLAGS}")
 else()
   set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
     PROPERTIES COMPILE_OPTIONS

@@ -631,9 +643,9 @@ if(NOT FBGEMM_CPU_ONLY)
     codegen/embedding_forward_quantized_split_lookup.cu
     src/memory_utils/memory_utils.cu
     src/memory_utils/memory_utils_ops.cu
-    src/embedding_inplace_update.cu
+    src/embedding_inplace_ops/embedding_inplace_update.cu
     src/histogram_binning_calibration_ops.cu
-    src/input_combine.cu
+    src/input_combine_ops/input_combine.cu
     src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
     src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
     src/jagged_tensor_ops/dense_to_jagged_forward.cu

@@ -651,8 +663,8 @@ if(NOT FBGEMM_CPU_ONLY)
     src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
     src/jagged_tensor_ops/jagged_unique_indices.cu
     src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
-    src/layout_transform_ops.cu
-    src/metric_ops.cu
+    src/layout_transform_ops/layout_transform_ops.cu
+    src/metric_ops/metric_ops.cu
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
     src/quantize_ops/quantize_bfloat16.cu

@@ -691,7 +703,10 @@ if(NOT FBGEMM_CPU_ONLY)
     src/split_embeddings_cache/lxu_cache.cu
     src/split_embeddings_cache/linearize_cache_indices.cu
     src/split_embeddings_cache/reset_weight_momentum.cu
-    src/split_embeddings_utils.cu)
+    src/split_embeddings_utils/generate_vbe_metadata.cu
+    src/split_embeddings_utils/get_infos_metadata.cu
+    src/split_embeddings_utils/radix_sort_pairs.cu
+    src/split_embeddings_utils/transpose_embedding_input.cu)

   set_source_files_properties(${fbgemm_gpu_sources_static_gpu}
     PROPERTIES COMPILE_OPTIONS
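Note on the AVX hunks above: the new AVX2_FLAGS/AVX512_FLAGS variables exist because, in WSL builds where NVCC drives compilation, NVCC rejects host-only options such as -mavx2 (see issue #2135 linked in the diff). Prefixing each option with -Xcompiler tells NVCC to forward it to the host C++ compiler, i.e. effectively nvcc -Xcompiler -mavx2 ...; the semicolons are CMake list separators, not part of the flags themselves.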
7 changes: 5 additions & 2 deletions fbgemm_gpu/bench/batched_unary_embeddings_benchmark.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-# pyre-unsafe
 import functools
 from math import sqrt
 from typing import List, Tuple

@@ -29,7 +28,10 @@


 def generate_unary_feature(
-    batch_size: int, num_embeddings: int
+    batch_size: int,
+    num_embeddings: int
+    # pyre-fixme[24]: Generic type `list` expects 1 type parameter, use
+    # `typing.List[<element type>]` to avoid runtime subscripting errors.
 ) -> Tuple[List, List, List]:
     lengths = []
     offsets = []

@@ -90,6 +92,7 @@ def forward(
 @click.option("--num-tables", default=2)
 @click.option("--num-tasks", default=3)
 @click.option("--repeats", default=100)
+# pyre-fixme[2]: Parameter must be annotated.
 def main(batch_size, num_tables, num_tasks, repeats) -> None:
     device = torch.device("cuda", 0)
     torch.cuda.set_device(device)
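Note: the pyre-fixme[24] added above flags the bare List in the return annotation rather than fixing it. A fully parameterized signature would satisfy Pyre; the sketch below is hypothetical, and the int element types are an assumption based on the lengths/offsets/indices lists the function builds (its body is not shown in this hunk).

from typing import List, Tuple

def generate_unary_feature_typed(
    batch_size: int,
    num_embeddings: int,
) -> Tuple[List[int], List[int], List[int]]:
    # Same result shape as generate_unary_feature above; element types
    # are assumed, not taken from this diff.
    lengths: List[int] = []
    offsets: List[int] = []
    indices: List[int] = []
    # ... populate the three lists as in the original function ...
    return lengths, offsets, indices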
10 changes: 7 additions & 3 deletions fbgemm_gpu/bench/bench_utils.py
@@ -41,13 +41,13 @@ def benchmark_torch_function(  # noqa: C901
     copy_f_for_multi_thread_test: bool = False,
 ) -> Tuple[float, torch.Tensor]:
     logging.info(f"Start to benchmark {name}...")
-    if device != "" and device != "cuda":
+    if device != "cpu" and device != "" and device != "cuda":
         torch.cuda.set_device(device)
     for _ in range(num_warmups):
         output = f(*args)

     assert num_threads > 0
-    if torch.cuda.is_available() and (num_threads == 1):
+    if device != "cpu" and torch.cuda.is_available() and (num_threads == 1):
         cache = torch.empty(
             int(flush_gpu_cache_size_mb * 1024 * 1024 // 4),
             dtype=torch.float,

@@ -69,7 +69,7 @@
             [s.elapsed_time(e) for s, e in zip(start_event, end_event)]
         )
         elapsed_time = torch.mean(times).item() * 1.0e-3
-    elif torch.cuda.is_available() and (num_threads > 1):
+    elif device != "cpu" and torch.cuda.is_available() and (num_threads > 1):
         cache = torch.empty(
             int(flush_gpu_cache_size_mb * 1024 * 1024 // 4),
             dtype=torch.float,

@@ -156,6 +156,10 @@ def benchmark_requests(
 ) -> float:
     times = []

+    # Run at least one warmup iteration to avoid the long cudaLaunchKernel time
+    # for the first kernel
+    num_warmups = num_warmups + 1 if num_warmups >= 0 else 1
+
     if num_warmups > 0:
         indices, offsets, weights = requests[0]
         for _ in range(num_warmups):
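Note: the device != "cpu" guards added above let benchmark_torch_function time CPU-only ops without touching CUDA, even on hosts where torch.cuda.is_available() is True. A minimal sketch of the guarded GPU-cache flush, simplified from the hunks above (the real helper takes more parameters and also times the function):

import torch

def maybe_flush_gpu_cache(device: str, flush_gpu_cache_size_mb: int = 40) -> None:
    # Skip all CUDA work when benchmarking on CPU, even if a GPU is present.
    if device != "cpu" and torch.cuda.is_available():
        # Allocate and zero a large float buffer so the GPU L2 cache is
        # flushed between timed runs, as benchmark_torch_function does.
        cache = torch.empty(
            int(flush_gpu_cache_size_mb * 1024 * 1024 // 4),
            dtype=torch.float,
            device="cuda",
        )
        cache.zero_()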
(The remaining changed files in this commit are not shown here.)
