diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml
index 5759349d2..e59dff34e 100644
--- a/.github/workflows/unit_test_4gpu.yaml
+++ b/.github/workflows/unit_test_4gpu.yaml
@@ -32,4 +32,4 @@ jobs:
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
         mkdir artifacts-to-be-uploaded
-        python ./test_runner.py artifacts-to-be-uploaded
+        python ./test_runner.py artifacts-to-be-uploaded --ngpu 4
diff --git a/.github/workflows/unit_test_8gpu.yaml b/.github/workflows/unit_test_8gpu.yaml
new file mode 100644
index 000000000..6499eb78f
--- /dev/null
+++ b/.github/workflows/unit_test_8gpu.yaml
@@ -0,0 +1,35 @@
+name: 8 GPU Unit Test
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+
+concurrency:
+  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-test:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.48xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      # This image is faster to clone than the default, but it lacks CC needed by triton
+      # (1m25s vs 2m37s).
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        pip config --user set global.progress_bar off
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
+        mkdir artifacts-to-be-uploaded
+        python ./test_runner.py artifacts-to-be-uploaded --ngpu 8
diff --git a/test_runner.py b/test_runner.py
index f7a4c7a44..120859174 100755
--- a/test_runner.py
+++ b/test_runner.py
@@ -28,6 +28,9 @@ class OverrideDefinitions:
     requires_seed_checkpoint: bool = False
     ngpu: int = 4
 
+    def __repr__(self):
+        return self.test_descr
+
 
 def build_test_list(args):
     """
@@ -170,6 +173,22 @@ def build_test_list(args):
             ],
             "Checkpoint Integration Test - Save Model Weights Only bf16",
         ),
+        OverrideDefinitions(
+            [
+                [
+                    "--checkpoint.enable_checkpoint",
+                    f"--job.dump_folder {args.output_dir}/pp_dp_tp/",
+                    "--experimental.pipeline_parallel_degree 2",
+                    "--experimental.pipeline_parallel_split_points layers.1",
+                    "--training.data_parallel_degree 2",
+                    "--training.tensor_parallel_degree 2",
+                    "--model.norm_type rmsnorm",  # fused_rmsnorm not yet compatible with TP
+                ],
+            ],
+            "PP+DP+TP 3D test",
+            requires_seed_checkpoint=True,
+            ngpu=8,
+        ),
     ]
     return integration_tests_flavors
 
@@ -188,7 +207,8 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):
     # run_test supports sequence of tests.
     for override_arg in test_flavor.override_args:
-        cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK=0,1,2,3 ./run_llama_train.sh"
+        all_ranks = ",".join(map(str, range(test_flavor.ngpu)))
+        cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_llama_train.sh"
        if override_arg:
            cmd += " " + " ".join(override_arg)
        print(
@@ -229,13 +249,17 @@ def run_tests(args):
             )
             if is_integration_test:
                 for test_flavor in integration_tests_flavors[config_file]:
-                    run_test(test_flavor, full_path)
+                    if (args.ngpu == 8 and test_flavor.ngpu == 8) or (
+                        args.ngpu == 4 and test_flavor.ngpu <= 4
+                    ):
+                        run_test(test_flavor, full_path)
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("output_dir")
     parser.add_argument("--config_dir", default="./train_configs")
+    parser.add_argument("--ngpu", default=4, type=int)
     args = parser.parse_args()
 
     if not os.path.exists(args.output_dir):
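
For local runs, the same GPU-count gating added above can be exercised directly; the commands below are a minimal sketch mirroring the two CI invocations (the directory name is just the positional output_dir argument and can be any writable path):

    python ./test_runner.py artifacts-to-be-uploaded            # default --ngpu 4: runs the existing tests with ngpu <= 4
    python ./test_runner.py artifacts-to-be-uploaded --ngpu 8   # runs only the 8 GPU tests, e.g. the PP+DP+TP 3D test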