Merge pull request #58 from ROCm/IFU-2024-04-19
Ifu 2024 04 19
liligwu authored Apr 19, 2024
2 parents 7ec3b82 + 6ed2049 commit a821cf4
Showing 116 changed files with 2,750 additions and 2,928 deletions.
77 changes: 24 additions & 53 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -64,7 +64,7 @@ __configure_fbgemm_gpu_build_clang () {
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
# shellcheck disable=SC2206
build_args+=(
--cxxprefix ${conda_prefix}
--cxxprefix=${conda_prefix}
)
}

@@ -258,6 +258,11 @@ __configure_fbgemm_gpu_build () {
__configure_fbgemm_gpu_build_clang
fi

# Set verbosity
build_args+=(
--verbose
)

# shellcheck disable=SC2145
echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}"
}
@@ -307,7 +312,7 @@ __build_fbgemm_gpu_set_run_multicore () {
export run_multicore=""
if [[ $core =~ $re && $sockets =~ $re ]] ; then
local n_core=$((core * sockets))
export run_multicore=" -j ${n_core}"
export run_multicore="-j ${n_core}"
fi

echo "[BUILD] Set multicore run option for setup.py: ${run_multicore}"
@@ -443,15 +448,26 @@ build_fbgemm_gpu_package () {
echo "################################################################################"
echo ""

# Distribute Python extensions as wheels on Linux
# Set packaging options
build_args+=(
--package_channel="${fbgemm_release_channel}"
--python-tag="${python_tag}"
--plat-name="${python_plat_name}"
)

# Prepend build options correctly for `python -m build`
# https://build.pypa.io/en/stable/index.html
# https://gregoryszorc.com/blog/2023/10/30/my-user-experience-porting-off-setup.py/
for i in "${!build_args[@]}"; do
build_args[i]="--config-setting=--build-option=${build_args[i]}"
done

# Build the wheel. Invoke using `python -m build`
# https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html
echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..."
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py "${run_multicore}" bdist_wheel \
--package_channel="${fbgemm_release_channel}" \
--python-tag="${python_tag}" \
--plat-name="${python_plat_name}" \
--verbose \
python -m build --wheel --no-isolation \
"${build_args[@]}"

# Run checks on the built libraries
@@ -503,7 +519,6 @@ build_fbgemm_gpu_install () {
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py "${run_multicore}" install \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries
@@ -519,47 +534,3 @@ build_fbgemm_gpu_install () {

echo "[BUILD] FBGEMM-GPU build + install completed"
}

build_fbgemm_gpu_develop () {
env_name="$1"
fbgemm_variant="$2"
fbgemm_variant_targets="$3"
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant"
echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)"
echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)"
echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)"
echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)"
return 1
fi

# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Set up and configure the build
__build_fbgemm_gpu_common_pre_steps || return 1
__configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1

echo "################################################################################"
echo "# Build + Install FBGEMM-GPU Package (Develop)"
echo "#"
echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
echo "################################################################################"
echo ""

# Parallelism may need to be limited to prevent the build from being
# canceled for going over ulimits
echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py "${run_multicore}" build develop \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries
(run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1

echo "[BUILD] FBGEMM-GPU build + develop completed"
}
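
For readers following the packaging change above: a minimal sketch of the new wheel-build flow outside of CI, assuming placeholder values for the release channel, Python tag, and platform name (the CI scripts compute these). The point it illustrates is how each setup.py option must be wrapped in --config-setting=--build-option= before being handed to python -m build.

#!/usr/bin/env bash
# Sketch only; the option values below are placeholders, not the CI-computed ones.
set -euo pipefail

build_args=(
  --package_channel=nightly            # placeholder release channel
  --python-tag=py310                   # placeholder Python tag
  --plat-name=manylinux2014_x86_64     # placeholder platform tag
)

# `python -m build` forwards setup.py options only when each one is wrapped
# in --config-setting=--build-option=...
for i in "${!build_args[@]}"; do
  build_args[i]="--config-setting=--build-option=${build_args[i]}"
done

# --no-isolation reuses the current environment instead of a fresh build venv
python -m build --wheel --no-isolation "${build_args[@]}"

Run from the directory containing setup.py, this would drop a wheel under dist/, mirroring what build_fbgemm_gpu_package now does via conda run.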
29 changes: 22 additions & 7 deletions .github/scripts/fbgemm_gpu_test.bash
@@ -205,6 +205,22 @@ run_fbgemm_gpu_tests () {
done
}

test_all_fbgemm_gpu_modules () {
local env_name="$1"
local fbgemm_variant="$2"

local target_directories=(
fbgemm_gpu/test
fbgemm_gpu/experimental/example/test
)

for test_dir in "${target_directories[@]}"; do
cd "${test_dir}" || return 1
run_fbgemm_gpu_tests "${env_name}" "${fbgemm_variant}" || return 1
cd - || return 1
done
}


################################################################################
# FBGEMM_GPU Test Bulk-Combination Functions
@@ -292,9 +308,8 @@ test_fbgemm_gpu_build_and_install () {
cd ~/FBGEMM/ || return 1
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1

cd ~/FBGEMM/fbgemm_gpu/test || return 1
run_fbgemm_gpu_tests "${env_name}" "${pytorch_variant_type}" || return 1
cd - || return 1
cd ~/FBGEMM/ || return 1
test_all_fbgemm_gpu_modules "${env_name}" "${pytorch_variant_type}" || return 1
}

test_fbgemm_gpu_setup_and_pip_install () {
@@ -323,11 +338,11 @@ test_fbgemm_gpu_setup_and_pip_install () {

local env_name="test_py${py_version}_pytorch_${pytorch_channel_version}_fbgemm_${fbgemm_gpu_channel_version}_${variant_type}/${variant_version}"
local env_name="${env_name//\//_}"
test_setup_conda_environment "${env_name}" 'no-compiler' "${py_version}" pip "${pytorch_channel_version}" "${variant_type}" "${variant_version}" || return 1
install_fbgemm_gpu_pip "${env_name}" "${fbgemm_gpu_channel_version}" "${variant_type}/${variant_version}" || return 1
cd ~/FBGEMM/fbgemm_gpu/test || return 1
test_setup_conda_environment "${env_name}" 'no-compiler' "${py_version}" pip "${pytorch_channel_version}" "${variant_type}" "${variant_version}" || return 1
install_fbgemm_gpu_pip "${env_name}" "${fbgemm_gpu_channel_version}" "${variant_type}/${variant_version}" || return 1
cd ~/FBGEMM || return 1

run_fbgemm_gpu_tests "${env_name}" "${variant_type}";
test_all_fbgemm_gpu_modules "${env_name}" "${variant_type}";
local retcode=$?

echo "################################################################################"
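A brief usage sketch for the new test_all_fbgemm_gpu_modules entry point; the environment name, checkout path, and direct sourcing of the helper script are assumptions for illustration, not the exact CI invocation.

# Usage sketch; "build_env" and the checkout location are assumed.
cd ~/FBGEMM || exit 1
# shellcheck disable=SC1091
. .github/scripts/fbgemm_gpu_test.bash     # assumes its own dependencies are already sourced
test_all_fbgemm_gpu_modules build_env cpu  # walks fbgemm_gpu/test and fbgemm_gpu/experimental/example/test
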
4 changes: 2 additions & 2 deletions .github/scripts/nova_postscript.bash
@@ -42,8 +42,8 @@ else
fi

$CONDA_RUN python3 -c "import torch; print('cuda.is_available() ', torch.cuda.is_available()); print ('device_count() ',torch.cuda.device_count());"
cd "${FBGEMM_REPO}/fbgemm_gpu/test" || { echo "[NOVA] Failed to cd to fbgemm_gpu/test from $(pwd)"; };
run_fbgemm_gpu_tests "${BUILD_ENV_NAME}" "${fbgemm_variant}"
cd "${FBGEMM_REPO}" || { echo "[NOVA] Failed to cd to ${FBGEMM_REPO} from $(pwd)"; };
test_all_fbgemm_gpu_modules "${BUILD_ENV_NAME}" "${fbgemm_variant}"

# Workaround EACCES: permission denied error at checkout step
chown -R 1000:1000 /__w/FBGEMM/FBGEMM/ || echo "Unable to chown 1000:1000 from $USER, uid: $(id -u)"
5 changes: 3 additions & 2 deletions .github/scripts/utils_base.bash
@@ -88,15 +88,16 @@ env_name_or_prefix () {
}

test_network_connection () {
wget -q --timeout 1 pypi.org -O /dev/null
exec_with_retries 3 wget -q --timeout 1 pypi.org -O /dev/null
local exit_status=$?

# https://man7.org/linux/man-pages/man1/wget.1.html
if [ $exit_status == 0 ]; then
echo "[CHECK] Network does not appear to be blocked."
else
echo "[CHECK] Network check exit status: ${exit_status}"
echo "[CHECK] Network appears to be blocked; please proxy the network connetions, i.e. re-run the command prefixed with 'with-proxy'."
echo "[CHECK] Network appears to be blocked or suffering from poor connection."
echo "[CHECK] Please remember to proxy the network connetions if needed, i.e. re-run the command prefixed with 'with-proxy'."
return 1
fi
}
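exec_with_retries is defined elsewhere in these scripts and its body is not part of this diff. Purely as an illustration of the calling convention used above (maximum attempt count first, then the command), a generic retry wrapper could look like the following sketch.

# Illustrative sketch only; not the repository's actual exec_with_retries implementation.
retry_sketch () {
  local max_attempts="$1"; shift
  local attempt=1
  until "$@"; do
    if (( attempt >= max_attempts )); then
      echo "[EXEC] Command failed after ${max_attempts} attempt(s): $*"
      return 1
    fi
    echo "[EXEC] Attempt ${attempt} failed; retrying ..."
    attempt=$(( attempt + 1 ))
    sleep 2
  done
}

# Example: the same network probe used in test_network_connection, with up to 3 attempts
retry_sketch 3 wget -q --timeout 1 pypi.org -O /dev/null
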
1 change: 1 addition & 0 deletions .github/scripts/utils_build.bash
@@ -246,6 +246,7 @@ install_build_tools () {
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
bazel \
build \
click \
cmake \
hypothesis \
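The build entry added to the conda install list above is assumed to be the PyPA build frontend that the new python -m build invocation depends on. A quick, hypothetical sanity check after installation:

# Sanity-check sketch; the environment name "build_env" is an assumption.
conda run -n build_env python -c "import build; print(build.__version__)"
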
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_cpu.yml
@@ -182,7 +182,7 @@ jobs:
- name: Test with PyTest
timeout-minutes: ${{ matrix.host-machine.timeout }}
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu

- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true')) && matrix.compiler == 'gcc' }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -202,7 +202,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda

- name: Push Wheel to PyPI
if: ${{ (github.event_name == 'schedule' && matrix.cuda-version == matrix.cuda-version-publish) || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish) }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -191,4 +191,4 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV rocm
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_pip.yml
@@ -99,7 +99,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: ${{ matrix.host-machine.timeout }}
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu


test_pypi_install_cuda:
@@ -159,7 +159,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda


test_pypi_install_rocm:
@@ -225,4 +225,4 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV rocm
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_release_cpu.yml
@@ -174,7 +174,7 @@ jobs:
- name: Test with PyTest
timeout-minutes: ${{ matrix.host-machine.timeout }}
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cpu

- name: Push FBGEMM_GPU (CPU version) Binary to PYPI
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' }}
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_release_cuda.yml
@@ -184,7 +184,7 @@ jobs:

- name: Test with PyTest
timeout-minutes: 20
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cuda
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV cuda

- name: Push FBGEMM_GPU Binary to PYPI
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
8 changes: 2 additions & 6 deletions bench/ConvUnifiedBenchmark.cc
@@ -281,12 +281,8 @@ void performance_test(
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
cout << "WARNING: the timer may be inaccurate when used by multiple threads."
<< endl;
cout << header << "Im2Col (ms), "
<< "Packing (ms), "
<< "Kernel (ms), "
<< "Postprocessing (ms), "
<< "fbgemmPacked (ms), "
<< "Total (ms), "
cout << header << "Im2Col (ms), " << "Packing (ms), " << "Kernel (ms), "
<< "Postprocessing (ms), " << "fbgemmPacked (ms), " << "Total (ms), "
<< "GOPS" << endl;
#else
cout << setw(6) << header << setw(5) << "GOPS" << endl;
5 changes: 2 additions & 3 deletions bench/ConvertBenchmark.cc
@@ -28,9 +28,8 @@ void performance_test() {
normal_distribution<float> dist;
default_random_engine engine;

cout << setw(4) << "M"
<< " elements_per_sec_ref"
<< " elements_per_sec_simd" << endl;
cout << setw(4) << "M" << " elements_per_sec_ref" << " elements_per_sec_simd"
<< endl;

array<int, 8> dims{1, 10, 32, 40, 129, 256, 1024, 8000};

8 changes: 3 additions & 5 deletions bench/EmbeddingQuantizeBenchmark.cc
@@ -34,11 +34,9 @@ void performance_test() {
} else {
cout << "With scale and bias as float" << endl;
}
cout << setw(8) << "bit_rate"
<< ", " << setw(6) << "rows"
<< "," << setw(6) << "cols"
<< "," << setw(16) << "elems_per_usec"
<< "," << setw(10) << "GB/Sec" << endl;
cout << setw(8) << "bit_rate" << ", " << setw(6) << "rows" << "," << setw(6)
<< "cols" << "," << setw(16) << "elems_per_usec" << "," << setw(10)
<< "GB/Sec" << endl;
std::vector<int> bit_rates;
if (is_same<T, float16>::value) {
bit_rates = {2, 4, 8};
18 changes: 8 additions & 10 deletions bench/EmbeddingSpMDMNBitBenchmark.cc
@@ -352,17 +352,15 @@ int run_benchmark(
cout << "prefetch off, ";
}

cout << "b/w, " << bytes / 1e9 / t << ", GB/s, "
<< "effective b/w, " << bytes_padded / 1e9 / t << ", GB/s, "
<< "time, " << t << ", autovec b/w, " << bytes / 1e9 / t_autovec
<< ", GB/s, "
cout << "b/w, " << bytes / 1e9 / t << ", GB/s, " << "effective b/w, "
<< bytes_padded / 1e9 / t << ", GB/s, " << "time, " << t
<< ", autovec b/w, " << bytes / 1e9 / t_autovec << ", GB/s, "
<< "autovec eff. b/w, " << bytes_padded / 1e9 / t_autovec
<< ", GB/s, "
<< "autovec time, " << t_autovec << ", ref b/w, "
<< bytes / 1e9 / t_ref << ", GB/s, "
<< "ref eff. b/w, " << bytes_padded / 1e9 / t_ref << ", GB/s, "
<< "ref time, " << t_ref << ", autovec speedup, "
<< t_ref / t_autovec << ", asmjit speedup, " << t_ref / t << endl;
<< ", GB/s, " << "autovec time, " << t_autovec << ", ref b/w, "
<< bytes / 1e9 / t_ref << ", GB/s, " << "ref eff. b/w, "
<< bytes_padded / 1e9 / t_ref << ", GB/s, " << "ref time, " << t_ref
<< ", autovec speedup, " << t_ref / t_autovec << ", asmjit speedup, "
<< t_ref / t << endl;
} // flush_cache
} // has_weight
return 0;