Skip to content

Commit

Permalink
Merge pull request #53 from ROCm/rocm-jaxlib-v0.4.30-qa-multigpu-disa…
Browse files Browse the repository at this point in the history
…ble-triton

Add multigpu script and disable triton tests
  • Loading branch information
hsharsha authored Oct 15, 2024
2 parents f3e91a6 + 4ea5b6f commit c718ef3
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 6 deletions.
79 changes: 79 additions & 0 deletions build_tools/rocm/run_xla_multi_gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================

set -e
set -x

# Count of "processor" entries in /proc/cpuinfo; echoed in the job summary.
N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)

# If rocm-smi exists locally (it should) use it to find out how many GPUs we
# have to test with. The probe must sit in the `if` condition: with `set -e`
# active, a bare failing `rocm-smi -i` would abort the script before any
# `$?` check could fall back to a single GPU.
if rocm-smi -i; then
  TF_GPU_COUNT=$(rocm-smi -i | grep 'Device ID' | grep 'GPU' | wc -l)
else
  TF_GPU_COUNT=1
fi

if [[ $TF_GPU_COUNT -lt 4 ]]; then
  echo "Found only ${TF_GPU_COUNT} gpus, multi-gpu tests need atleast 4 gpus."
  # Stop without running any tests. Status 0 is what the previous bare `exit`
  # returned (it inherited the status of the echo above).
  exit 0
fi

# Concurrent test jobs = GPUs found * tests allowed per GPU.
TF_TESTS_PER_GPU=1
N_TEST_JOBS=$(expr $TF_GPU_COUNT \* $TF_TESTS_PER_GPU)

# Print the job summary with one blank line before and after it.
printf '\n%s\n\n' "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."

# Pick the ROCm install directory, in order of precedence:
#   1. first positional argument, if non-empty
#   2. the ROCM_PATH environment variable, if non-empty
#   3. /opt/rocm-6.0.2
ROCM_INSTALL_DIR="${1:-${ROCM_PATH:-/opt/rocm-6.0.2}}"

# Environment exported for the build: python3 as found on PATH, the ROCm
# enable switch, and the ROCm install directory chosen above.
export PYTHON_BIN_PATH=$(which python3)
export TF_NEED_ROCM=1
export ROCM_PATH=$ROCM_INSTALL_DIR

# Space-separated exclusion tags: "-requires-gpu-smNN" and
# "-requires-gpu-smNN-only" for each listed sm version.
UNSUPPORTED_GPU_TAGS=""
for sm_version in 60 70 80 86 89 90; do
  UNSUPPORTED_GPU_TAGS+="${UNSUPPORTED_GPU_TAGS:+ }-requires-gpu-sm${sm_version} -requires-gpu-sm${sm_version}-only"
done

# Final comma-separated tag filter handed to bazel.
TAGS_FILTER="-oss_excluded,-oss_serial,${UNSUPPORTED_GPU_TAGS// /,}"

# Bazel keeps only the LAST --action_env value given for a variable, so both
# XLA options must travel in a single XLA_FLAGS string; passing them as two
# separate --action_env=XLA_FLAGS=... options silently drops the first one.
XLA_FLAGS_FOR_TESTS="--xla_gpu_force_compilation_parallelism=16 --xla_gpu_enable_llvm_module_compilation_parallelism=true"

# Run the multi-GPU test targets with the ROCm config. Sharding is disabled,
# failing tests are retried up to 3 times, and --keep_going lets the remaining
# targets run after a failure. Test parallelism is capped at N_TEST_JOBS.
bazel \
    test \
    --config=rocm \
    --build_tag_filters="${TAGS_FILTER}" \
    --test_tag_filters="${TAGS_FILTER}" \
    --test_timeout=920,2400,7200,9600 \
    --test_sharding_strategy=disabled \
    --test_output=errors \
    --flaky_test_attempts=3 \
    --keep_going \
    --local_test_jobs="${N_TEST_JOBS}" \
    --test_env=TF_TESTS_PER_GPU="${TF_TESTS_PER_GPU}" \
    --test_env=TF_GPU_COUNT="${TF_GPU_COUNT}" \
    --action_env=XLA_FLAGS="${XLA_FLAGS_FOR_TESTS}" \
    -- //xla/tests:collective_ops_test_e2e_gpu_amd_any \
    //xla/tests:collective_ops_test_gpu_amd_any \
    //xla/tests:replicated_io_feed_test_gpu_amd_any \
    //xla/tools/multihost_hlo_runner:functional_hlo_runner_test_gpu_amd_any \
    //xla/pjrt/distributed:topology_util_test \
    //xla/pjrt/distributed:client_server_test
7 changes: 2 additions & 5 deletions xla/service/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,6 @@ xla_test(
backends = [
"gpu_a100",
"gpu_h100",
"gpu_amd_any",
],
shard_count = 10,
tags = ["nomac"],
Expand Down Expand Up @@ -1255,15 +1254,14 @@ xla_test(
backends = [
"gpu_a100",
"gpu_h100",
"gpu_amd_any",
],
deps = [
":gpu_device_info_for_tests",
":ir_emitter_triton",
":triton_fusion_analysis",
":triton_support",
":triton_test_utils",
"//third_party/protobuf",
"third_party/protobuf",
"//xla:xla_data_proto_cc",
"//xla:xla_proto_cc",
"//xla/hlo/ir:hlo",
Expand All @@ -1284,7 +1282,6 @@ xla_test(
backends = [
"gpu_a100",
"gpu_h100",
"gpu_amd_any",
],
tags = ["nomac"],
deps = [
Expand Down Expand Up @@ -6165,7 +6162,7 @@ xla_test(
backend_tags = {"gpu": [
"requires-gpu-sm80",
]},
backends = ["gpu"],
backends = ["gpu_a100", "gpu_h100"],
deps = [
":autotuner_compile_util",
":autotuner_util",
Expand Down
1 change: 0 additions & 1 deletion xla/service/gpu/tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,6 @@ xla_test(
backends = [
"gpu_a100",
"gpu_v100",
"gpu_amd_any",
],
deps = [
":gpu_codegen_test",
Expand Down
1 change: 1 addition & 0 deletions xla/tests/collective_ops_test_e2e.cc
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,7 @@ ENTRY main.12 {

TEST_F(CollectiveOpsTestE2EWindowedNonWindowed,
WindowedEinsumE2EAllGatherAndReduceScatterF8) {
GTEST_SKIP() << "F8E4M3 not supported";
absl::string_view kModuleReplicatedStr = R"(
HloModule pjit__unnamed_wrapped_function_, entry_computation_layout={(<<F8E4M3>>[2,16,48]{2,1,0}, <<F8E4M3>>[48,192]{1,0}, <<F8E4M3>>[192,48]{1,0}, bf16[], bf16[], bf16[], bf16[], bf16[])->bf16[2,16,48]{2,1,0}}, allow_spmd_sharding_propagation_to_parameters={false,false,false,false}, num_partitions=4
Expand Down
2 changes: 2 additions & 0 deletions xla/tools/multihost_hlo_runner/functional_hlo_runner_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,8 @@ TEST_F(FunctionalHloRunnerTest, ShardedAutotuningWorks) {
GTEST_SKIP() << "GPU-only test.";
}

GTEST_SKIP() << "Triton is not enabled.";

tsl::SubProcess child[kNumNodes];
for (int node_id = 0; node_id < kNumNodes; ++node_id) {
std::vector<std::string> argv;
Expand Down

0 comments on commit c718ef3

Please sign in to comment.