From 3ba37000b64eca3539e7a0929a95e920e7d37144 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 19 Jan 2024 21:44:44 -0500 Subject: [PATCH 01/10] Fix BERT_pytorch install --- torchbenchmark/models/BERT_pytorch/install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/BERT_pytorch/install.py b/torchbenchmark/models/BERT_pytorch/install.py index 26ab44a808..c8ae2b8a5f 100644 --- a/torchbenchmark/models/BERT_pytorch/install.py +++ b/torchbenchmark/models/BERT_pytorch/install.py @@ -3,7 +3,7 @@ def setup_install(): - subprocess.check_call([sys.executable, 'setup.py', 'develop']) + subprocess.check_call([sys.executable, '-m', "pip", 'install', '-e', '.']) if __name__ == '__main__': setup_install() From fd43ba660565ca909b189fb03755c6bbdaa36ae6 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 19 Jan 2024 21:56:57 -0500 Subject: [PATCH 02/10] Fixed install --- torchbenchmark/models/BERT_pytorch/install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/BERT_pytorch/install.py b/torchbenchmark/models/BERT_pytorch/install.py index c8ae2b8a5f..f1533e07a6 100644 --- a/torchbenchmark/models/BERT_pytorch/install.py +++ b/torchbenchmark/models/BERT_pytorch/install.py @@ -3,7 +3,7 @@ def setup_install(): - subprocess.check_call([sys.executable, '-m', "pip", 'install', '-e', '.']) + subprocess.check_call([sys.executable, '-m', "pip", 'install', '--user', '-e', '.']) if __name__ == '__main__': setup_install() From 45ce705032391eaf8e65f48a57e3800f453a105e Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Fri, 19 Jan 2024 23:00:21 -0500 Subject: [PATCH 03/10] Fix bert pytorch install --- torchbenchmark/models/BERT_pytorch/install.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchbenchmark/models/BERT_pytorch/install.py b/torchbenchmark/models/BERT_pytorch/install.py index f1533e07a6..8b7470c4d7 100644 --- a/torchbenchmark/models/BERT_pytorch/install.py +++ b/torchbenchmark/models/BERT_pytorch/install.py @@ -6,4 +6,5 @@ def setup_install(): subprocess.check_call([sys.executable, '-m', "pip", 'install', '--user', '-e', '.']) if __name__ == '__main__': - setup_install() + # setup_install() + pass From fa6ca382ba6938be7ce7d086f54af6afc590ebd8 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Sun, 21 Jan 2024 23:57:16 -0500 Subject: [PATCH 04/10] Remove A10G ci due to limited disk space issue. --- .github/workflows/pr-a10g.yml | 63 ----------------------------------- 1 file changed, 63 deletions(-) delete mode 100644 .github/workflows/pr-a10g.yml diff --git a/.github/workflows/pr-a10g.yml b/.github/workflows/pr-a10g.yml deleted file mode 100644 index c5f549443d..0000000000 --- a/.github/workflows/pr-a10g.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: TorchBench PR Test on A10G -on: - pull_request: - workflow_dispatch: - push: - branches: - - main - -env: - CONDA_ENV: "torchbench" - DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" - SETUP_SCRIPT: "/workspace/setup_instance.sh" - HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} - - -jobs: - pr-test: - # AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu - # OS version: Amazon Linux 2 - runs-on: [self-hosted, linux.g5.4xlarge.nvidia.gpu] - timeout-minutes: 1440 # 24 hours - environment: docker-s3-upload - steps: - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - with: - github-secret: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }} - - name: Checkout TorchBench - uses: actions/checkout@v3 - with: - path: benchmark - - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ env.DOCKER_IMAGE }} - - name: Install NVIDIA Driver, docker runtime, set GPU_FLAG - id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main - - name: Install and Test TorchBench - run: | - container_name=$(docker run \ - -e CONDA_ENV="${CONDA_ENV}" \ - -e SETUP_SCRIPT="${SETUP_SCRIPT}" \ - -e HUGGING_FACE_HUB_TOKEN="${HUGGING_FACE_HUB_TOKEN}" \ - --tty \ - --detach \ - --shm-size=32gb \ - -v "${PWD}/benchmark:/benchmark" \ - --gpus all \ - -w / \ - "${{ env.DOCKER_IMAGE }}" \ - tail -f /dev/null - ) - echo "Container name: ${container_name}" - docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh - docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh - - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main - if: always() - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true From 40b1a32e81121f019c503a8fb01edf3c1071d1ab Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 22 Jan 2024 12:19:08 -0500 Subject: [PATCH 05/10] Test a10g container disk size --- .github/workflows/pr-a10g.yml | 66 +++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 .github/workflows/pr-a10g.yml diff --git a/.github/workflows/pr-a10g.yml b/.github/workflows/pr-a10g.yml new file mode 100644 index 0000000000..2d1d185594 --- /dev/null +++ b/.github/workflows/pr-a10g.yml @@ -0,0 +1,66 @@ +name: TorchBench PR Test on A10G +on: + pull_request: + workflow_dispatch: + push: + branches: + - main + +env: + CONDA_ENV: "torchbench" + DOCKER_IMAGE: "ghcr.io/pytorch/torchbench:latest" + SETUP_SCRIPT: "/workspace/setup_instance.sh" + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + + +jobs: + pr-test: + # AWS A10G GPU instance label: linux.g5.4xlarge.nvidia.gpu + # OS version: Amazon Linux 2 + runs-on: [self-hosted, linux.g5.4xlarge.nvidia.gpu] + timeout-minutes: 1440 # 24 hours + environment: docker-s3-upload + steps: + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }} + - name: Checkout TorchBench + uses: actions/checkout@v3 + with: + path: benchmark + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ env.DOCKER_IMAGE }} + - name: Install NVIDIA Driver, docker runtime, set GPU_FLAG + id: install-nvidia-driver + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + - name: Install and Test TorchBench + run: | + container_name=$(docker run \ + -e CONDA_ENV="${CONDA_ENV}" \ + -e SETUP_SCRIPT="${SETUP_SCRIPT}" \ + -e HUGGING_FACE_HUB_TOKEN="${HUGGING_FACE_HUB_TOKEN}" \ + --tty \ + --detach \ + --shm-size=32gb \ + -v "${PWD}/benchmark:/benchmark" \ + --gpus all \ + -w / \ + "${{ env.DOCKER_IMAGE }}" \ + tail -f /dev/null + ) + echo "Container name: ${container_name}" + df -h + docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh + docker exec -t -w "/benchmark" "${container_name}" df -h + docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + From 4e4e20a8aeddc370408166ad4cea5c58f2fd7a32 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 22 Jan 2024 12:20:40 -0500 Subject: [PATCH 06/10] Another fix --- .github/workflows/pr-a10g.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-a10g.yml b/.github/workflows/pr-a10g.yml index 2d1d185594..c3bd311ee1 100644 --- a/.github/workflows/pr-a10g.yml +++ b/.github/workflows/pr-a10g.yml @@ -38,6 +38,7 @@ jobs: uses: pytorch/test-infra/.github/actions/setup-nvidia@main - name: Install and Test TorchBench run: | + df -h container_name=$(docker run \ -e CONDA_ENV="${CONDA_ENV}" \ -e SETUP_SCRIPT="${SETUP_SCRIPT}" \ @@ -52,9 +53,8 @@ jobs: tail -f /dev/null ) echo "Container name: ${container_name}" - df -h - docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh docker exec -t -w "/benchmark" "${container_name}" df -h + docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main From 1917f10d225923fbde1239b847b5b116260d9455 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 22 Jan 2024 12:48:00 -0500 Subject: [PATCH 07/10] Test ci From 15da2f046581270516fef6cc818f26b6e3a4b204 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 22 Jan 2024 14:19:00 -0500 Subject: [PATCH 08/10] Fix A10g CI workflow --- .github/workflows/pr-a10g.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pr-a10g.yml b/.github/workflows/pr-a10g.yml index c3bd311ee1..d41c8f1a61 100644 --- a/.github/workflows/pr-a10g.yml +++ b/.github/workflows/pr-a10g.yml @@ -38,7 +38,6 @@ jobs: uses: pytorch/test-infra/.github/actions/setup-nvidia@main - name: Install and Test TorchBench run: | - df -h container_name=$(docker run \ -e CONDA_ENV="${CONDA_ENV}" \ -e SETUP_SCRIPT="${SETUP_SCRIPT}" \ @@ -53,7 +52,7 @@ jobs: tail -f /dev/null ) echo "Container name: ${container_name}" - docker exec -t -w "/benchmark" "${container_name}" df -h + docker exec -t -w "/" "${container_name}" bash -c "sudo chown -R runner /benchmark; sudo chgrp -R runner /benchmark" docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_install.sh docker exec -t -w "/benchmark" "${container_name}" bash /benchmark/scripts/torchbench_test.sh - name: Teardown Linux From 5da4e09730bdd8b74b3cb5b4984068b129bcbd6c Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 22 Jan 2024 15:55:15 -0500 Subject: [PATCH 09/10] Fix installation issue. --- torchbenchmark/models/BERT_pytorch/install.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchbenchmark/models/BERT_pytorch/install.py b/torchbenchmark/models/BERT_pytorch/install.py index 8b7470c4d7..f1533e07a6 100644 --- a/torchbenchmark/models/BERT_pytorch/install.py +++ b/torchbenchmark/models/BERT_pytorch/install.py @@ -6,5 +6,4 @@ def setup_install(): subprocess.check_call([sys.executable, '-m', "pip", 'install', '--user', '-e', '.']) if __name__ == '__main__': - # setup_install() - pass + setup_install() From 70ecb8114b0e9965a2fb02750bcffbaa1f6db4b9 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 22 Jan 2024 15:57:00 -0500 Subject: [PATCH 10/10] Another fix --- .github/workflows/pr-a10g.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr-a10g.yml b/.github/workflows/pr-a10g.yml index d41c8f1a61..fb390a7638 100644 --- a/.github/workflows/pr-a10g.yml +++ b/.github/workflows/pr-a10g.yml @@ -62,4 +62,3 @@ jobs: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true -