diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..059d9f2 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,8 @@ +[run] +# Exclude submodule directory from coverage +omit = + launcher/nemo/nemo_framework_launcher/* + template/* + +[report] +fail_under = 85 diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..6b0164b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,31 @@ +--- +name: Bug report +about: File a report to help us reproduce and fix the problem +title: '' +labels: 'bug' +assignees: '' + +--- + +## Describe the bug +A clear and concise description of what the bug is. + +## How to Reproduce? +A clear, step-by-step set of instructions to reproduce the bug. +The provided code need to be **complete** and **runnable**, if additional data is needed, please include them in the issue. + +## Expected behavior +A clear and concise description of what you expected to happen. + +## Screenshots, error messages or logs +If applicable, please share with us screenshots, error messages or logs to help explain your problem. + +## System information +A description of your system. Please provide: +- **Docker image you ran against**: +- **Source code version you ran against**: +- **Python version**: +- **Hardware accelerator used**: + +## Additional context +Add any other context about the problem here. Please provide any additional steps you have tried to solve your issue here. diff --git a/.github/ISSUE_TEMPLATE/documentation_request.md b/.github/ISSUE_TEMPLATE/documentation_request.md new file mode 100644 index 0000000..4d349c9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation_request.md @@ -0,0 +1,17 @@ +--- +name: Documentation request +about: Request improved documentation +title: '' +labels: 'documentation request' +assignees: '' + +--- + +## What did you find confusing? +A clear and concise description of what you found confusing. Ex. I tried to [...] but I didn't understand how to [...] + +## Describe how documentation can be improved +A clear and concise description of where documentation was lacking and how it can be improved. + +## Additional context +Add any other context or screenshots about the documentation request here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..f781f87 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest new functionality for this project +title: '' +labels: 'feature request' +assignees: '' + +--- + +## Describe the feature you'd like +A clear and concise description of the functionality you want. + +## How would this feature be used? +A clear and concise description of the use case for this feature. Please provide an example, if possible. + +## Describe alternatives you've considered +A clear and concise description of any alternative solutions or features you've considered. + +## Additional context +Add any other context about the feature request here. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..65582a7 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,25 @@ +## Description + +### Motivation +Explain the motivation + +### Changes +* List your changes + +### Testing +Explain how the changes were tested + +## Merge Checklist +Put an x in the boxes that apply. 
If you're unsure about any of them, don't hesitate to ask. We're here to help! This is simply a reminder of what we are going to look for before merging your pull request. + +### General + - [ ] I have read the [CONTRIBUTING](../CONTRIBUTING.md) doc + - [ ] I have run `pre-commit run --all-files` on my code. It will check for [this configuration](../.pre-commit-config.yaml). + - [ ] I have updated any necessary documentation, including [READMEs](../README.md) and API docs (if appropriate) + - [ ] I have verified the licenses used in the license-files artifact generated in the Python License Scan CI check. If the license workflow fails, kindly check the licenses used in the artifact. + +### Tests + - [ ] I have run `pytest` on my code and all unit tests passed. + - [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate) + +By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. diff --git a/.github/workflows/license-scan-runner-pull.yml b/.github/workflows/license-scan-runner-pull.yml new file mode 100644 index 0000000..78059ee --- /dev/null +++ b/.github/workflows/license-scan-runner-pull.yml @@ -0,0 +1,71 @@ +name: Python License Scan CI + +on: + pull_request: + branches: + - main # Change to the branch we want to target for PRs + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + submodules: recursive # Checkout submodules as well + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' # Set python version to 3.8 + + - name: Set up Node + uses: actions/setup-node@v4 + with: + node-version: 22 # Set node version to 22 + + - name: Install package dependencies + run: | + python -m pip install --upgrade pip + pip install -r launcher/nemo/nemo_framework_launcher/requirements.txt + pip install -r requirements.txt + pip install pip-licenses + + - name: Run license scanner + run: | + pip-licenses --with-description --order=license --format=json > LicenseArtifact.txt + + - name: Upload license files as artifacts + uses: actions/upload-artifact@v4 + with: + name: LicenseArtifact + path: | + LicenseArtifact.txt + retention-days: 5 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: ${{ secrets.RUNNER_ROLE_ARN }} + aws-region: us-west-2 + + - name: Git clone repolinter, get ruleset and run repolinter + run: | + cd .. 
+ git clone https://github.com/todogroup/repolinter.git + cd repolinter + npm install + aws s3 cp --recursive ${{ secrets.REPOLINTER_S3LINK }} ./repolinter-ruleset + node ./bin/repolinter.js lint ${{ github.workspace }} -r ./repolinter-ruleset/amazon-ospo-ruleset.json > ${{ github.workspace }}/scanOutput.txt + + - name: Perform license check + run: | + cd ${{ github.workspace }} + aws s3 cp ${{ secrets.LICENSES_S3LINK }} ./ApprovedLicenses.txt + chmod +x scripts/licenseChecker.sh + ./scripts/licenseChecker.sh diff --git a/.github/workflows/pre-commit-check-runner-pull.yml b/.github/workflows/pre-commit-check-runner-pull.yml new file mode 100644 index 0000000..94c38ee --- /dev/null +++ b/.github/workflows/pre-commit-check-runner-pull.yml @@ -0,0 +1,28 @@ +name: Python Pre Commit Check CI + +on: + pull_request: + branches: + - main # Change to the branch we want to target for PRs + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' # Set python version to 3.8 + + - name: Install pre-commit dependencies + run: | + python -m pip install --upgrade pip + pip install pre-commit + + - name: Run pre-commit checks + run: | + pre-commit run --all-files diff --git a/.github/workflows/pre-commit-check-runner-push.yml b/.github/workflows/pre-commit-check-runner-push.yml new file mode 100644 index 0000000..2ce5ac0 --- /dev/null +++ b/.github/workflows/pre-commit-check-runner-push.yml @@ -0,0 +1,28 @@ +name: Python Pre Commit Check CI After Commit + +on: + push: + branches: + - main # Triggers on direct pushes to the main branch + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' # Set python version to 3.8 + + - name: Install pre-commit dependencies + run: | + python -m pip install --upgrade pip + pip install pre-commit + + - name: Run pre-commit checks + run: | + pre-commit run --all-files diff --git a/.github/workflows/repo-monitoring-cron.yml b/.github/workflows/repo-monitoring-cron.yml new file mode 100644 index 0000000..692454b --- /dev/null +++ b/.github/workflows/repo-monitoring-cron.yml @@ -0,0 +1,69 @@ +name: Repository Monitoring + +on: + schedule: + - cron: '0 16 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.run_id }} + cancel-in-progress: true + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + check-pr-alerts: + runs-on: ubuntu-latest + if: github.event.repository.visibility == 'public' + timeout-minutes: 10 + outputs: + pr_count: ${{ steps.pr-count.outputs.count }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Check for open PRs + id: pr-count + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT }} + run: | + pr_count=$(gh pr list --state open --limit 1000 | wc -l) + echo "count=$pr_count" >> $GITHUB_OUTPUT + + check-issue-alerts: + runs-on: ubuntu-latest + if: github.event.repository.visibility == 'public' + timeout-minutes: 10 + outputs: + issue_count: ${{ steps.issue-count.outputs.count }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Check for open issues + id: issue-count + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT }} + run: | + issue_count=$(gh issue list --state open --limit 1000 | wc -l) + echo "count=$issue_count" >> $GITHUB_OUTPUT + + put-metric-data: + 
runs-on: ubuntu-latest + if: github.event.repository.visibility == 'public' + timeout-minutes: 10 + needs: [check-pr-alerts, check-issue-alerts] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: ${{ secrets.RUNNER_ROLE_ARN }} + role-session-name: repo-monitoring-cron-session + aws-region: us-west-2 + + - name: Put PR Alert Metric Data + run: | + aws cloudwatch put-metric-data --metric-name PRAlert --namespace RepoMetrics --value ${{ needs.check-pr-alerts.outputs.pr_count }} --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes + + - name: Put Issue Alert Metric Data + run: | + aws cloudwatch put-metric-data --metric-name IssueAlert --namespace RepoMetrics --value ${{ needs.check-issue-alerts.outputs.issue_count }} --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes diff --git a/.github/workflows/security-monitoring-cron.yml b/.github/workflows/security-monitoring-cron.yml new file mode 100644 index 0000000..e37aa44 --- /dev/null +++ b/.github/workflows/security-monitoring-cron.yml @@ -0,0 +1,100 @@ +name: Security Monitoring + +on: + schedule: + - cron: '0 16 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.run_id }} + cancel-in-progress: true + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + check-dependabot-alerts: + runs-on: ubuntu-latest + outputs: + dependabot_alert_status: ${{ steps.check-dependabot-alerts.outputs.dependabot_alert_status }} + steps: + - name: Check for dependabot alerts + id: check-dependabot-alerts + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea + with: + github-token: ${{ secrets.GH_PAT }} + script: | + async function checkAlerts() { + const owner = '${{ github.repository_owner }}'; + const repo = '${{ github.event.repository.name }}'; + + const dependabotAlerts = await github.rest.dependabot.listAlertsForRepo({ + owner, + repo, + headers: { + 'accept': 'applications/vnd.github+json' + } + }); + const activeDependabotAlerts = dependabotAlerts.data.filter(alert => alert.state === 'open'); + core.setOutput('dependabot_alert_status', activeDependabotAlerts.length > 0 ? '1': '0'); + } + await checkAlerts(); + + check-code-scanning-alerts: + runs-on: ubuntu-latest + outputs: + code_scanning_alert_status: ${{ steps.check-code-scanning-alerts.outputs.code_scanning_alert_status }} + steps: + - name: Check for security alerts for public repository + id: check-code-scanning-alerts + if: github.event.repository.visibility == 'public' + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea + with: + github-token: ${{ secrets.GH_PAT }} + script: | + async function checkAlerts() { + const owner = '${{ github.repository_owner }}'; + const repo = '${{ github.event.repository.name }}'; + const ref = 'refs/heads/main'; + + const codeScanningAlerts = await github.rest.codeScanning.listAlertsForRepo({ + owner, + repo, + ref: ref + }); + const activeCodeScanningAlerts = codeScanningAlerts.data.filter(alert => alert.state === 'open'); + return activeCodeScanningAlerts.length > 0 ? 
'1': '0'; + } + await checkAlerts(); + - name: Set code scanning alerts output + id: set-code-scanning-alerts-output + run: | + if ${{ github.event.repository.visibility == 'public' }}; then + echo "code_scanning_alert_status=${{ steps.check-code-scanning-alerts.outputs.result }}" >> $GITHUB_OUTPUT + else + echo "code_scanning_alert_status=0" >> $GITHUB_OUTPUT + fi + + put-metric-data: + runs-on: ubuntu-latest + needs: [check-dependabot-alerts, check-code-scanning-alerts] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@12e3392609eaaceb7ae6191b3f54bbcb85b5002b + with: + role-to-assume: ${{ secrets.RUNNER_ROLE_ARN }} + aws-region: us-west-2 + - name: Put Dependabot Alert Metric Data + run: | + if [ "${{ needs.check-dependabot-alerts.outputs.dependabot_alert_status }}" == "1" ]; then + aws cloudwatch put-metric-data --metric-name DependabotAlert --namespace SecurityMonitoringMetrics --value 1 --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes + else + aws cloudwatch put-metric-data --metric-name DependabotAlert --namespace SecurityMonitoringMetrics --value 0 --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes + fi + - name: Put Code Scanning Alert Metric Data + run: | + if [ "${{ needs.check-code-scanning-alerts.outputs.code_scanning_alert_status }}" == "1" ]; then + aws cloudwatch put-metric-data --metric-name CodeScanningAlert --namespace SecurityMonitoringMetrics --value 1 --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes + else + aws cloudwatch put-metric-data --metric-name CodeScanningAlert --namespace SecurityMonitoringMetrics --value 0 --unit Count --dimensions ProjectName=sagemaker-hyperpod-recipes + fi diff --git a/.github/workflows/unit-test-runner-pull.yml b/.github/workflows/unit-test-runner-pull.yml new file mode 100644 index 0000000..13365b9 --- /dev/null +++ b/.github/workflows/unit-test-runner-pull.yml @@ -0,0 +1,32 @@ +name: Python Unit Test CI + +on: + pull_request: + branches: + - main # Change to the branch we want to target for PRs + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + submodules: recursive # Checkout submodules as well + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' # Set python version to 3.8 + + - name: Install unit test dependencies + run: | + python -m pip install --upgrade pip + pip install -r launcher/nemo/nemo_framework_launcher/requirements.txt + pip install pytest + pip install pytest-cov + + - name: Run unit tests + run: | + python -m pytest diff --git a/.github/workflows/unit-test-runner-push.yml b/.github/workflows/unit-test-runner-push.yml new file mode 100644 index 0000000..7a83702 --- /dev/null +++ b/.github/workflows/unit-test-runner-push.yml @@ -0,0 +1,32 @@ +name: Python Unit Test CI After Commit + +on: + push: + branches: + - main # Triggers on direct pushes to the main branch + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + submodules: recursive # Checkout submodules as well + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' # Set python version to 3.8 + + - name: Install unit test dependencies + run: | + python -m pip install --upgrade pip + pip install -r launcher/nemo/nemo_framework_launcher/requirements.txt + pip install pytest + pip install pytest-cov + + - name: Run unit tests + run: | + python -m pytest diff --git a/.gitignore b/.gitignore 
new file mode 100644 index 0000000..d3150f6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +# log and data files +trace +.DS_Store +.hydra +.bash_history.local +results/ +outputs/ +tmp/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +**.pyc +core.* + +# Unit test / coverage reports +coverage_html_report/ +.coverage +.coverage.* +.cache +*.cover +.hypothesis/ +.pytest_cache/ + +# Playground area +mypg/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..8edcbe9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "launcher/nemo/nemo_framework_launcher"] + path = launcher/nemo/nemo_framework_launcher + url = https://github.com/NVIDIA/NeMo-Framework-Launcher.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..4127bdc --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,33 @@ +default_language_version: + # force all unspecified python hooks to run python3 + python: python3 +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: end-of-file-fixer + exclude: ^(tests/slurm_workflow/slurm_baseline_artifacts/|tests/k8s_workflow/k8s_baseline_artifacts/|tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/) + - id: trailing-whitespace + exclude: ^(tests/slurm_workflow/slurm_baseline_artifacts/|tests/k8s_workflow/k8s_baseline_artifacts/|tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/) +- repo: https://github.com/humitos/mirrors-autoflake.git + rev: v1.3 + hooks: + - id: autoflake + args: ['--in-place', '--expand-star-imports', '--ignore-init-module-imports', '--remove-all-unused-imports'] + additional_dependencies: [setuptools] +- repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + args: [--line-length=120] +- repo: https://github.com/pocc/pre-commit-hooks + rev: v1.1.1 + hooks: + - id: clang-format + args: [--style=file, -i] +- repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: # imports sorting + - id: isort + name: isort (python) + args: ["--profile", "black"] diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..5b627cf --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,4 @@ +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c4b6a1c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,59 @@ +# Contributing Guidelines + +Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional +documentation, we greatly value feedback and contributions from our community. + +Please read through this document before submitting any issues or pull requests to ensure we have all the necessary +information to effectively respond to your bug report or contribution. + + +## Reporting Bugs/Feature Requests + +We welcome you to use the GitHub issue tracker to report bugs or suggest features. + +When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already +reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: + +* A reproducible test case or series of steps +* The version of our code being used +* Any modifications you've made relevant to the bug +* Anything unusual about your environment or deployment + + +## Contributing via Pull Requests +Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: + +1. You are working against the latest source on the *main* branch. +2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. +3. You open an issue to discuss any significant work - we would hate for your time to be wasted. + +To send us a pull request, please: + +1. Fork the repository. +2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. +3. Ensure local tests pass. +4. Commit to your fork using clear commit messages. +5. Send us a pull request, answering any default questions in the pull request interface. +6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. + +GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and +[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). + + +## Finding contributions to work on +Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. + + +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. + + +## Security issue notifications +If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. + + +## Licensing + +See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. diff --git a/Config b/Config new file mode 100644 index 0000000..768cc9a --- /dev/null +++ b/Config @@ -0,0 +1,26 @@ +package.SagemakerTrainingLauncher = { + interfaces = (1.0); + + # Use NoOpBuild. See https://w.amazon.com/index.php/BrazilBuildSystem/NoOpBuild + build-system = no-op; + build-tools = { + 1.0 = { + NoOpBuild = 1.0; + }; + }; + + # Use runtime-dependencies for when you want to bring in additional + # packages when deploying. + # Use dependencies instead if you intend for these dependencies to + # be exported to other packages that build against you. + dependencies = { + 1.0 = { + }; + }; + + runtime-dependencies = { + 1.0 = { + }; + }; + +}; diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8f25a20 --- /dev/null +++ b/LICENSE @@ -0,0 +1,173 @@ + Apache License + Version 2.0, January 2004 + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..616fc58 --- /dev/null +++ b/NOTICE @@ -0,0 +1 @@ +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/README.md b/README.md new file mode 100644 index 0000000..bf890a4 --- /dev/null +++ b/README.md @@ -0,0 +1,313 @@ +# Amazon SageMaker HyperPod recipes + +## Overview + +Amazon SageMaker HyperPod recipes help customers get started with training and fine-tuning popular publicly available foundation models in just minutes, with state-of-the-art performance. The recipes provide a pre-configured training stack that is tested and validated on Amazon SageMaker. + +Please see [Amazon SageMaker HyperPod Recipes](https://docs.aws.com/hyperpod/recipes) for documentation. + +The recipes support Amazon Sagemaker HyperPod (with Slurm or Amazon EKS for workload orchestration) and Amazon SageMaker training jobs. 
+ +Amazon SageMaker HyperPod recipes include built-in support for: + +- Model parallelism - tensor parallelism and context parallel +- Automated distributed checkpointing +- Distributed optimizer +- Accelerators: NVIDIA H100 (ml.p5), NVIDIA A100 (ml.p4), and AWS Trainium (ml.trn1) +- Fine-tuning: Full, QLoRA, LoRA +- AWS Instances: ml.p5.48xlarge, ml.p4d.24xlarge, and ml.trn1.32xlarge instance families +- Supported Models: Llama, Mistral, Mixtral models +- Model Evaluation: Tensorboard + +## Model Support + +### Pre-Training + +List of specific pre-training recipes used by the launch scripts. + +| Source | Model | Size | Sequence length | Nodes | Instance | Accelerator | Recipe | Script | +| ------------ | --------- | ---- | --------------- | ----- | ------------- | ----------- | ------ | ------ | +| Hugging Face | Llama 3.2 | 11b | 8192 | 4 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh) | +| Hugging Face | Llama 3.2 | 90b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Llama 3 | 70b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Llama 3 | 70b | 16384 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh) | +| Hugging Face | Llama 3 | 70b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Llama 3 | 70b | 8192 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh) | +| Hugging Face | Llama 3 | 70b | 8192 | 16 | ml.trn1.32xlarge | TRN | [link](recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh) | +| Hugging Face | Llama 3 | 8b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh) | +| Hugging Face | Llama 3 | 8b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Llama 3 | 8b | 8192 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh) | +| Hugging Face | Llama 3 | 8b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh) | +| 
Hugging Face | Llama 3 | 8b | 8192 | 4 | ml.trn1.32xlarge | TRN | [link](recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh) | +| Megatron | Llama 3 | 8b | 8192 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/llama/megatron_llama3_1_8b_nemo.yaml) | - | +| Hugging Face | Mistral | 7b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh) | +| Hugging Face | Mistral | 7b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Mistral | 7b | 8192 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh) | +| Hugging Face | Mistral | 7b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Mixtral | 22b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Mixtral | 22b | 16384 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh) | +| Hugging Face | Mixtral | 22b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Mixtral | 22b | 8192 | 64 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh) | +| Hugging Face | Mixtral | 7b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh) | +| Hugging Face | Mixtral | 7b | 16384 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh) | +| Hugging Face | Mixtral | 7b | 8192 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh) | +| Hugging Face | Mixtral | 7b | 8192 | 32 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml) | [link](launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh) | + + +### Fine-Tuning + +List of specific fine-tuning recipes used by the launch 
scripts. +All model sources are from Hugging Face. + +| Model | Method | Size | Sequence length | Nodes | Instance | Accelerator | Recipe | Script | +| --------- | ------ | ---- | ----------------| ----- | -------------- | ----------- | ------ | ------ | +| Llama 3 | QLoRA | 405b | 131072 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh) | +| Llama 3 | LoRA | 405b | 16384 | 6 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh) | +| Llama 3 | QLoRA | 405b | 16384 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_qlora.sh) | +| Llama 3 | LoRA | 405b | 8192 | 6 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_lora.sh) | +| Llama 3 | QLoRA | 405b | 8192 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_qlora.sh) | +| Llama 3 | SFT | 70b | 16384 | 16 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh) | +| Llama 3 | LoRA | 70b | 16384 | 2 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh) | +| Llama 3 | SFT | 70b | 8192 | 10 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh) | +| Llama 3 | LoRA | 70b | 8192 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_lora.sh) | +| Llama 3 | SFT | 8b | 16384 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh) | +| Llama 3 | LoRA | 8b | 16384 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh) | +| Llama 3 | SFT | 8b | 8192 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh) | +| Llama 3 | LoRA | 8b | 8192 | 1 | ml.p5.48xlarge | GPU H100 | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_lora.sh) | +| Llama 3 | SFT | 70b | 8192 | 32 | ml.p4d.24xlarge | GPU A100 | [link](recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh) | +| Llama 3 | LoRA | 70b | 8192 | 20 | ml.p4d.24xlarge | GPU A100 | 
[link](recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_lora.sh) |
+| Llama 3 | SFT | 8b | 8192 | 4 | ml.p4d.24xlarge | GPU A100 | [link](recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_fine_tuning.yaml) | [link](launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh) |
+| Llama 3 | LoRA | 8b | 8192 | 1 | ml.p4d.24xlarge | GPU A100 | [link](recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_lora.yaml) | [link](launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_lora.sh) |
+| Llama 3 | SFT | 8b | 8192 | 1 | ml.trn1.32xlarge | TRN | [link](recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_trn1_fine_tuning.yaml) | [link](launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh) |
+
+
+## Installation
+
+Amazon SageMaker HyperPod recipes should be installed on the head node of your HyperPod cluster or on your local machine in a Python virtual environment.
+
+```
+git clone --recursive git@github.com:aws/sagemaker-hyperpod-recipes.git
+cd sagemaker-hyperpod-recipes
+python3 -m venv venv
+source venv/bin/activate
+pip3 install -r requirements.txt
+```
+
+## Usage Guide
+
+When using the SageMaker HyperPod recipes, you can either
+create your own training script or leverage the SageMaker HyperPod adapter,
+which includes popular publicly available models like Llama or Mistral. Based on your specific
+needs, you might need to modify the parameters defined in the recipes for
+pre-training or fine-tuning. Once your configurations are set up, you can run training on SageMaker
+HyperPod, with Slurm or Amazon EKS for workload orchestration. Alternatively, you can run the recipe on
+SageMaker training jobs using the Amazon SageMaker Python SDK.
+
+### Running a recipe via a Slurm job on a SageMaker HyperPod cluster
+
+To run a recipe via a Slurm job on a HyperPod cluster, you need to SSH into the head node
+of the cluster and clone the HyperPod recipes repository onto a shared filesystem, such as
+FSx or NFS. Next, follow the installation instructions to set up a Python
+virtual environment with the required dependencies. Once the environment is
+ready, you can launch a training job from the launcher\_scripts folder. For
+example, you can modify the recipe launcher script [run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain](launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh)
+with customized configurations such as your image or output directory. Once
+you have set all the necessary parameters in the recipe launcher, you can
+start the training process by running the script.
+
+```bash
+launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh
+```
+
+We recommend using `enroot` to initiate a training process on
+the Slurm cluster. You can get the latest Docker image from the [SMP release notes](https://docs.aws.amazon.com/sagemaker/latest/dg/distributed-model-parallel-support-v2.html). Refer to the following example to generate a squash file
+with the `enroot` command, and see the documentation on building an [AWS-optimized Nemo-Launcher image](https://github.com/aws-samples/awsome-distributed-training/tree/main/3.test_cases/2.nemo-launcher#2-build-aws-optimized-nemo-launcher-image).
+
+```bash
+REGION="us-west-2"
+# TAG is the SMP image tag listed in the release notes, e.g. 2.4.1-gpu-py311-cu121
+IMAGE="658645717510.dkr.ecr.${REGION}.amazonaws.com/smdistributed-modelparallel:${TAG}"
+aws ecr get-login-password --region "${REGION}" | docker login --username AWS --password-stdin 658645717510.dkr.ecr.${REGION}.amazonaws.com
+enroot import -o $PWD/smdistributed-modelparallel.sqsh dockerd://${IMAGE}
+mv $PWD/smdistributed-modelparallel.sqsh "/fsx/smdistributed-modelparallel.sqsh"
+```
+
+To use a prebuilt Enroot squash file:
+```
+wget https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/enroot/2.4.1-gpu-py311-cu121-ubuntu20.04-sagemaker-smpv2.7.0.sqsh
+```
+
+To use the Enroot squash file to start training, use the following example to
+modify the `recipes_collection/config.yaml` file.
+
+```
+container: /fsx/smdistributed-modelparallel.sqsh
+```
+
+### Running a recipe on a SageMaker HyperPod cluster orchestrated by Amazon EKS
+
+Before starting training on your cluster, configure your local environment
+by following the installation instructions.
+You also need to install kubectl and Helm on your local machine.
+Refer to the documentation for installing [kubectl](https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html)
+and [Helm](https://helm.sh/docs/intro/install/).
+
+You can then submit a training job using the same launcher script with the
+following commands:
+
+```
+aws eks update-kubeconfig --region "${CLUSTER_REGION}" --name "${CLUSTER_NAME}"
+launcher_scripts/llama/run_hf_llama3_8b_seq8192.sh
+```
+
+We recommend using the [HyperPod command-line tool](https://github.com/aws/sagemaker-hyperpod-cli)
+to launch a training job.
+
+```
+hyperpod start-job --recipe training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain \
+--persistent-volume-claims fsx-claim:data \
+--override-parameters \
+'{
+  "recipes.run.name": "hf-llama3-8b",
+  "recipes.exp_manager.exp_dir": "/data/",
+  "container": "658645717510.dkr.ecr..amazonaws.com/smdistributed-modelparallel:2.4.1-gpu-py311-cu121",
+  "recipes.model.data.train_dir": "",
+  "recipes.model.data.val_dir": "",
+  "cluster": "k8s",
+  "cluster_type": "k8s"
+}'
+```
+
+### Running a recipe on SageMaker training jobs
+
+SageMaker training jobs automatically spin up a resilient distributed training cluster,
+monitor the infrastructure, and auto-recover from faults to ensure a smooth training experience.
+You can leverage the SageMaker Python SDK to execute your recipes on SageMaker training jobs.
+
+```
+python3 -m venv venv
+source venv/bin/activate
+pip3 install --upgrade pip setuptools
+
+# install SageMaker SDK
+pip install --upgrade sagemaker
+```
+
+The following Python code snippet demonstrates how to submit a recipe to
+run on SageMaker training jobs using the `PyTorch`
+estimator from the SageMaker Python SDK.
+
+For example, to run the llama3-8b recipe on
+SageMaker training jobs, set the `training_recipe` argument to indicate which recipe to run: this
+can be one of the available recipes, or a URL or a local YAML file containing a modified
+recipe. Also set the local directory paths and Hugging Face access token, either by providing
+`recipe_overrides` or by modifying the recipe YAML file directly (the URL or local file).
+
+```python
+import os
+
+import boto3
+import sagemaker
+from sagemaker.debugger import TensorBoardOutputConfig
+from sagemaker.pytorch import PyTorch
+
+sagemaker_session = sagemaker.Session()
+role = sagemaker.get_execution_role()
+
+bucket = sagemaker_session.default_bucket()
+output = os.path.join(f"s3://{bucket}", "output")
+output_path = ""
+
+recipe_overrides = {
+    "run": {
+        "results_dir": "/opt/ml/model",
+    },
+    "exp_manager": {
+        "exp_dir": "",
+        "explicit_log_dir": "/opt/ml/output/tensorboard",
+        "checkpoint_dir": "/opt/ml/checkpoints",
+    },
+    "model": {
+        "data": {
+            "train_dir": "/opt/ml/input/data/train",
+            "val_dir": "/opt/ml/input/data/val",
+        },
+    },
+}
+
+tensorboard_output_config = TensorBoardOutputConfig(
+    s3_output_path=os.path.join(output, 'tensorboard'),
+    container_local_output_path=recipe_overrides["exp_manager"]["explicit_log_dir"]
+)
+
+estimator = PyTorch(
+    output_path=output_path,
+    base_job_name="llama-recipe",
+    role=role,
+    instance_type="ml.p5.48xlarge",
+    training_recipe="training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain",
+    recipe_overrides=recipe_overrides,
+    sagemaker_session=sagemaker_session,
+    tensorboard_output_config=tensorboard_output_config,
+)
+
+estimator.fit(inputs={"train": "s3 or fsx input", "val": "s3 or fsx input"}, wait=True)
+```
+
+Running the above code creates a `PyTorch` estimator object with the specified training recipe
+and then trains the model using the `fit()` method. The new `training_recipe` parameter enables you
+to specify the recipe you want to use.
+
+
+## Troubleshooting
+
+During training, if GPU memory usage approaches its limit, saving sharded checkpoints to S3 storage may result in a core dump.
+To address this issue, you may choose to:
+
+* Reduce the overall memory consumption of the model training:
+  * Increase the number of compute nodes for the training process.
+  * Decrease the batch size.
+  * Increase the sharding degree.
+* Use FSx as the shared file system
+
+By taking one of the above approaches, you can alleviate the memory pressure and prevent a core dump from occurring during checkpoint saving. An illustrative set of recipe overrides for these adjustments is sketched in the appendix at the end of this README.
+
+## Testing
+
+Follow the instructions in the Installation section above, then use the following commands to install the dependencies for testing:
+
+```
+pip install pytest
+pip install pytest-cov
+```
+
+### Unit Tests
+To run the unit tests, navigate to the root directory and use the command
+```python -m pytest``` plus any desired flags.
+
+The `pyproject.toml` file defines additional options that are always appended to the `pytest` command:
+```
+[tool.pytest.ini_options]
+...
+addopts = [
+    "--cache-clear",
+    "--quiet",
+    "--durations=0",
+    "--cov=launcher/",
+    # uncomment this line to see a detailed HTML test coverage report instead of the usual summary table output to stdout.
+    # "--cov-report=html",
+    "tests/",
+]
+```
+
+## Contributing
+We use pre-commit to unify our coding format. The steps to set it up are as follows:
+- Install pre-commit, which runs formatters before each commit, using `pip install pre-commit`
+- Set up the hooks from our pre-commit configuration in `.pre-commit-config.yaml` using `pre-commit install`
+
+When you commit, the pre-commit hooks will be applied. If for some reason you need to skip the check, you can run `git commit ... --no-verify`, but make sure to include the reason for skipping pre-commit in the commit message.
+
+## Security
+
+See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
+
+## License
+
+This project is licensed under the [Apache-2.0 License](LICENSE).
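+
+## Appendix: Illustrative recipe overrides for memory pressure
+
+The sketch below shows how the Troubleshooting guidance above could be expressed as recipe overrides,
+following the same dictionary structure as the SageMaker training jobs example. It is illustrative only:
+the `trainer.num_nodes`, `model.train_batch_size`, and `model.shard_degree` keys are hypothetical placeholders,
+and the actual field names depend on the recipe you are using, so check the corresponding YAML under
+`recipes_collection/recipes/` before applying any of them.
+
+```python
+# Hypothetical sketch: the key names below are placeholders and vary by recipe;
+# consult the recipe YAML you are running for the real field names.
+memory_pressure_overrides = {
+    "trainer": {
+        "num_nodes": 32,  # scale out to more compute nodes to lower per-GPU memory use
+    },
+    "model": {
+        "train_batch_size": 1,  # decrease the per-device batch size
+        "shard_degree": 16,  # increase the sharding degree to spread model state across more GPUs
+    },
+    "exp_manager": {
+        "checkpoint_dir": "/fsx/checkpoints",  # write checkpoints to an FSx shared file system
+    },
+}
+
+# These overrides would be passed in place of (or merged into) the `recipe_overrides`
+# dictionary used with the PyTorch estimator example above.
+```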
diff --git a/THIRD-PARTY.txt b/THIRD-PARTY.txt new file mode 100644 index 0000000..18a8b9b --- /dev/null +++ b/THIRD-PARTY.txt @@ -0,0 +1,175 @@ +** NeMo-Framework-Launcher; version 23.11 -- https://github.com/NVIDIA/NeMo-Framework-Launcher + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of this +License, each Contributor hereby grants to You a perpetual, worldwide, non- +exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, +prepare Derivative Works of, publicly display, publicly perform, sublicense, and +distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, +each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no- +charge, royalty-free, irrevocable (except as stated in this section) patent +license to make, have made, use, offer to sell, sell, import, and otherwise +transfer the Work, where such license applies only to those patent claims +licensable by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) with the Work +to which such Contribution(s) was submitted. If You institute patent litigation +against any entity (including a cross-claim or counterclaim in a lawsuit) +alleging that the Work or a Contribution incorporated within the Work +constitutes direct or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate as of the date +such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or +Derivative Works thereof in any medium, with or without modifications, and in +Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a +copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating +that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You +distribute, all copyright, patent, trademark, and attribution notices from the +Source form of the Work, excluding those notices that do not pertain to any part +of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, +then any Derivative Works that You distribute must include a readable copy of +the attribution notices contained within such NOTICE file, excluding those +notices that do not pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. + + You may add Your own copyright statement to Your modifications and may +provide additional or different license terms and conditions for use, +reproduction, or distribution of Your modifications, or for any such Derivative +Works as a whole, provided Your use, reproduction, and distribution of the Work +otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. 
Unless You explicitly state otherwise, any +Contribution intentionally submitted for inclusion in the Work by You to the +Licensor shall be under the terms and conditions of this License, without any +additional terms or conditions. Notwithstanding the above, nothing herein shall +supersede or modify the terms of any separate license agreement you may have +executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, +trademarks, service marks, or product names of the Licensor, except as required +for reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in +writing, Licensor provides the Work (and each Contributor provides its +Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied, including, without limitation, any warranties +or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any risks +associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in +tort (including negligence), contract, or otherwise, unless required by +applicable law (such as deliberate and grossly negligent acts) or agreed to in +writing, shall any Contributor be liable to You for damages, including any +direct, indirect, special, incidental, or consequential damages of any character +arising as a result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, work stoppage, +computer failure or malfunction, or any and all other commercial damages or +losses), even if such Contributor has been advised of the possibility of such +damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or +Derivative Works thereof, You may choose to offer, and charge a fee for, +acceptance of support, warranty, indemnity, or other liability obligations +and/or rights consistent with this License. However, in accepting such +obligations, You may act only on Your own behalf and on Your sole +responsibility, not on behalf of any other Contributor, and only if You agree to +indemnify, defend, and hold each Contributor harmless for any liability incurred +by, or claims asserted against, such Contributor by reason of your accepting any +such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +* For NeMo-Framework-Launcher see also this required NOTICE: + Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. diff --git a/launcher/__init__.py b/launcher/__init__.py new file mode 100644 index 0000000..6549052 --- /dev/null +++ b/launcher/__init__.py @@ -0,0 +1,12 @@ +# Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. diff --git a/launcher/accelerator_devices.py b/launcher/accelerator_devices.py new file mode 100644 index 0000000..7ecc6b8 --- /dev/null +++ b/launcher/accelerator_devices.py @@ -0,0 +1,74 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +acceleratorDevices = { + "p4d.24xlarge": 8, + "p4de.24xlarge": 8, + "p5.48xlarge": 8, + "trn1.2xlarge": 1, + "trn1.32xlarge": 16, + "trn1n.32xlarge": 16, + "g5.xlarge": 1, + "g5.2xlarge": 1, + "g5.4xlarge": 1, + "g5.8xlarge": 1, + "g5.12xlarge": 4, + "g5.16xlarge": 1, + "g5.24xlarge": 4, + "g5.48xlarge": 8, +} + +coresPerAcceleratorDevice = { + "p4d.24xlarge": 1, + "p4de.24xlarge": 1, + "p5.48xlarge": 1, + "trn1.2xlarge": 2, + "trn1.32xlarge": 2, + "trn1n.32xlarge": 2, + "g5.xlarge": 1, + "g5.2xlarge": 1, + "g5.4xlarge": 1, + "g5.8xlarge": 1, + "g5.12xlarge": 1, + "g5.16xlarge": 1, + "g5.24xlarge": 1, + "g5.48xlarge": 1, +} + + +def get_num_accelerator_devices(instance_type: str): + """ + Get the number of accelerator devices on an instance type. + Accelerator device could be GPU or Trainium chips + :param instance_type: AWS EC2 instance type + :return: number of accelerator devices for the instance type or None if instance + type not in the accelerator devices map + """ + if instance_type not in acceleratorDevices: + return None + + return acceleratorDevices[instance_type] + + +def get_num_cores_per_accelerator(instance_type: str): + """ + Get the number of cores per accelerator device on an instance type. + Currently, Trainium has 2 cores per device while Nvida has 1 core per device. + :param instance_type: AWS EC2 instance type + :return: number of cores for the accelerator device or None if instance type + not in the map + """ + if instance_type not in coresPerAcceleratorDevice: + return None + + return coresPerAcceleratorDevice[instance_type] diff --git a/launcher/config_validator/type_validator.py b/launcher/config_validator/type_validator.py new file mode 100644 index 0000000..609d33d --- /dev/null +++ b/launcher/config_validator/type_validator.py @@ -0,0 +1,151 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import os +from typing import Any + +from omegaconf import DictConfig, OmegaConf + +from launcher.config_validator.value_validator import get_argument + + +class TypeValidator: + # Define the list of key-type pairs + types = [ + ("hydra.output_subdir", "path"), + ("hydra.run.dir", "path"), + ("git.repo_url_or_path", "string"), + ("git.branch", "string"), + ("git.commit", "string"), + ("training_cfg.entry_script", "path"), + ("training_cfg.script_args", "list_dict"), + ("training_cfg.run.name", "string"), + ("training_cfg.run.nodes", "positive_integer"), + ("training_cfg.run.ntasks_per_node", "positive_integer"), + ("cluster.cluster_type", "string"), + ("cluster.instance_type", "string"), + ("cluster.cluster_config", "dict"), + ("cluster.cluster_config.namespace", "string"), + ("cluster.cluster_config.custom_labels", "dict"), + ("cluster.cluster_config.annotations", "dict"), + ("cluster.cluster_config.priority_class_name", "string"), + ("cluster.cluster_config.label_selector", "dict"), + ("cluster.cluster_config.persistentVolumeClaims", "list_dict"), + ("cluster.cluster_config.volumes", "list_dict"), + ("cluster.cluster_config.pullPolicy", "string"), + ("cluster.cluster_config.restartPolicy", "string"), + ("base_results_dir", "path"), + ("container_mounts", "list_path"), + ("container", "string"), + ("env_vars", "dict"), + ] + + def __init__(self, config: DictConfig): + self.config = config + + def validate(self): + for key, type in self.types: + argument = get_argument(self.config, key) + _check_types(argument, type, key) + + +def _is_valid_path(path) -> bool: + """ + Check if the input string is a valid file path. + + Parameters: + path (str): The path to validate. + + Returns: + bool: True if the path is valid, False otherwise. 
+ """ + if not isinstance(path, str): + return False + + try: + normalized_path = os.path.normpath(path) + return True + except Exception as e: + return False + + +def _is_positive_integer(argument) -> bool: + try: + val = int(argument) + if val < 1: + return False + except ValueError: + return False + return True + + +def _get_base_omega_conf_container(argument) -> (Any, bool): + try: + argument = OmegaConf.to_container(argument, resolve=True) + except Exception as e: + return None, False + return argument, True + + +def _is_list_of_dicts(argument) -> bool: + argument, status = _get_base_omega_conf_container(argument) + if status is False: + return False + return isinstance(argument, list) and all(item is None or isinstance(item, dict) for item in argument) + + +def _is_list_of_strings(argument) -> bool: + argument, status = _get_base_omega_conf_container(argument) + if status is False: + return False + return isinstance(argument, list) and all(item is None or isinstance(item, str) for item in argument) + + +def _is_list_of_paths(argument) -> bool: + argument, status = _get_base_omega_conf_container(argument) + if status is False: + return False + return isinstance(argument, list) and all(item is None or _is_valid_path(item) for item in argument) + + +def _is_dict(argument) -> bool: + argument, status = _get_base_omega_conf_container(argument) + if status is False: + return False + return isinstance(argument, dict) + + +def _check_types(argument, type, argument_name) -> None: + if argument is None: + return + + if type == "string" and not isinstance(argument, str): + raise TypeError("{} with val {} is not a string".format(argument_name, argument)) + + if type == "path" and not _is_valid_path(argument): + raise TypeError("{} with val {} is not a valid path".format(argument_name, argument)) + + if type == "list_string" and not _is_list_of_strings(argument): + raise TypeError("{} with val {} is not a list of string".format(argument_name, argument)) + + if type == "list_dict" and not _is_list_of_dicts(argument): + raise TypeError("{} with val {} is not a list of dictionary".format(argument_name, argument)) + + if type == "list_path" and not _is_list_of_paths(argument): + raise TypeError("{} with val {} is not a list of paths".format(argument_name, argument)) + + if type == "positive_integer" and not _is_positive_integer(argument): + raise TypeError("{} with val {} is not a positive integer".format(argument_name, argument)) + + if type == "dict" and not _is_dict(argument): + raise TypeError("{} with val {} is not a dictionary".format(argument_name, argument)) diff --git a/launcher/config_validator/value_validator.py b/launcher/config_validator/value_validator.py new file mode 100644 index 0000000..a5fb2bd --- /dev/null +++ b/launcher/config_validator/value_validator.py @@ -0,0 +1,257 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+ +import re + +from omegaconf import DictConfig + + +class ValueValidator: + def __init__(self, config: DictConfig): + self.config = config + + def validate(self) -> None: + # For all below validations, we will check if the argument is present => it should pass the validation + + # Mandatory arguments for all workflows + _validate_all_mandatory_argument(self.config) + + # Cluster type argument check for all workflows + _validate_cluster_type_argument(self.config) + + # PV argument check for all workflows + _validate_pv_arguments(self.config) + + # Volume argument check for all workflows + _validate_volume_arguments(self.config) + + # Pull policy argument check for all workflows + _validate_pull_policy_argument(self.config) + + # Restart policy argument check for all workflows + _validate_restart_policy_argument(self.config) + + # Namespace argument regex check for k8 workflows + _validate_namespace_argument(self.config) + + # Check for mandatory arguments for k8 custom script workflow + _validate_k8_custom_script_workflow_mandatory_argument(self.config) + + # Check the git url setting + _validate_git_url(self.config) + + +def _validate_mandatory_argument(argument, argument_name: str) -> None: + """ + Check if the mandatory input argument is not None. + + Parameters: + argument : The argument to validate. + argument_name : The name of argument to validate. + """ + if argument is None: + raise ValueError("Missing mandatory argument " + argument_name + " is not provided") + + +def get_argument(config: DictConfig, argument_name: str): + """ + Return the value of given argument_name from config + + Parameters: + config : Configuration dictionary + argument_name : The name of argument to fetch. + + Returns: + bool: Argument if present else return None + argument_name can have a nested key like key1.key2 in which case we will return config[key1][key2]. 
+ If config[key1] is None for above case we will return None + """ + argument_name_splits = argument_name.split(".") + if len(argument_name_splits) > 1: + subconfig = config.get(argument_name_splits[0]) + if subconfig is None: + return None + argument_name_splits.pop(0) + remaining_argument_name = ".".join(argument_name_splits) + return get_argument(subconfig, remaining_argument_name) + return config.get(argument_name) + + +def _validate_pv_arguments(config: DictConfig) -> None: + """ + Check all the information needed for persistentVolumeClaim is provided if it is not None + + Parameters: + config (DictConfig): Configuration dictionary + """ + cluster_config_name = "cluster.cluster_config" + cluster_config = get_argument(config, cluster_config_name) + pv_argument_name = "persistentVolumeClaims" + exception_message: str = "claimName and mountPath should be provided for persistentVolumeClaim" + if cluster_config is not None and pv_argument_name in cluster_config: + pv_arguments = cluster_config.get(pv_argument_name) + claim_name = "claimName" + mount_path = "mountPath" + for pv_argument in pv_arguments: + if pv_argument is None or claim_name not in pv_argument or mount_path not in pv_argument: + raise ValueError(exception_message) + claim_name_argument = pv_argument.get(claim_name) + mount_path_argument = pv_argument.get(mount_path) + if claim_name_argument is None or mount_path_argument is None: + raise ValueError(exception_message) + + +def _validate_volume_arguments(config: DictConfig) -> None: + """ + Check all the information needed for volume is provided if it is not None + Parameters: + config (DictConfig): Configuration dictionary + """ + cluster_config_name = "cluster.cluster_config" + cluster_config = get_argument(config, cluster_config_name) + volumes_argument_name = "volumes" + exception_message: str = "hostPath, mountPath, volumeName should be provided for volumes" + if cluster_config is not None and volumes_argument_name in cluster_config: + volume_arguments = cluster_config.get(volumes_argument_name) + if volume_arguments is None: + return + host_path = "hostPath" + mount_path = "mountPath" + volume_name = "volumeName" + print(volume_arguments) + for volume_argument in volume_arguments: + if ( + volume_argument is None + or host_path not in volume_argument + or mount_path not in volume_argument + or volume_name not in volume_argument + ): + raise ValueError(exception_message) + host_path_argument = volume_argument.get(host_path) + mount_path_argument = volume_argument.get(mount_path) + volume_name_argument = volume_argument.get(volume_name) + if host_path_argument is None or mount_path_argument is None or volume_name_argument is None: + raise ValueError(exception_message) + + +def _validate_pull_policy_argument(config: DictConfig) -> None: + """ + Check only valid pullPolicy is provided if it is not None + + Parameters: + config (DictConfig): Configuration dictionary + """ + pull_policy_argument_name = "cluster.cluster_config.pullPolicy" + pull_policy_argument = get_argument(config, pull_policy_argument_name) + if pull_policy_argument is not None: + supported_pull_policies = ["Always", "IfNotPresent", "Never"] + if pull_policy_argument not in supported_pull_policies: + raise ValueError("Provided pullPolicy is not supported") + + +def _validate_restart_policy_argument(config: DictConfig) -> None: + """ + Check only valid restartPolicy is provided if it is not None + + Parameters: + config (DictConfig): Configuration dictionary + """ + restart_policy_argument_name = 
"cluster.cluster_config.restartPolicy" + restart_policy_argument = get_argument(config, restart_policy_argument_name) + if restart_policy_argument is not None: + supported_restart_policies = ["Always", "OnFailure", "Never", "ExitCode"] + if restart_policy_argument not in supported_restart_policies: + raise ValueError("Provided restartPolicy is not supported") + + +def _validate_cluster_type_argument(config: DictConfig) -> None: + """ + Check only valid cluster_type is provided + + Parameters: + config (DictConfig): Configuration dictionary + """ + cluster_type_argument_name = "cluster.cluster_type" + cluster_type = get_argument(config, cluster_type_argument_name) + supported_cluster_types = ["slurm", "k8s", "sm_jobs"] + if cluster_type is not None and cluster_type not in supported_cluster_types: + raise ValueError("Provided cluster_type is not supported") + + +def _validate_namespace_argument(config: DictConfig) -> None: + """ + Check only valid kubectl namespace is provided + Naming Convention of Kubernetes Namespaces is + You can create a name with a maximum length of 253 characters using only alphanumeric characters and hyphens. + Names cannot start with a hyphen and the alpha characters can only be lowercase. + + Parameters: + config (DictConfig): Configuration dictionary + """ + + """ + Here's a breakdown of the regex pattern: + + ^ - Asserts the position at the start of the string. + (?!-) - A negative lookahead assertion to ensure the string does not start with a hyphen. + [a-z0-9-] - Matches any lowercase letter, digit, or hyphen. + {1,253} - Specifies that the string must be between 1 and 253 characters long. + $ - Asserts the position at the end of the string. + This pattern will ensure the string is constructed using only lowercase alphanumeric characters and hyphens, + is no longer than 253 characters, and does not start with a hyphen. + """ + namespace_regex = r"^(?!-)[a-z0-9-]{1,253}$" + namespace_argument_name = "cluster.cluster_config.namespace" + namespace_argument = get_argument(config, namespace_argument_name) + if namespace_argument is not None and not re.match(namespace_regex, namespace_argument): + raise ValueError( + "Provided namespace is not valid, Kindly provide Kubernetes Namespace " + "with a maximum length of 253 characters using only alphanumeric characters and hyphens. " + "Names cannot start with a hyphen and the alpha characters can only be lowercase." 
+ ) + + +def _validate_all_mandatory_argument(config: DictConfig) -> None: + mandatory_arguments_for_all_workflows = ["base_results_dir"] + for argument_name in mandatory_arguments_for_all_workflows: + argument = get_argument(config, argument_name) + _validate_mandatory_argument(argument, argument_name) + + +def _validate_k8_custom_script_workflow_mandatory_argument(config: DictConfig) -> None: + cluster_type = get_argument(config, "cluster.cluster_type") + training_cfg = get_argument(config, "training_cfg") + if cluster_type == "k8s" and training_cfg is not None: + k8_custom_script_mandatory_arguments = [ + "container", + "env_vars", + "training_cfg", + "training_cfg.entry_script", + "training_cfg.run", + "training_cfg.run.name", + "training_cfg.run.nodes", + "cluster", + "cluster.cluster_type", + "cluster.instance_type", + "cluster.cluster_config", + ] + for argument_name in k8_custom_script_mandatory_arguments: + argument = get_argument(config, argument_name) + _validate_mandatory_argument(argument, argument_name) + + +def _validate_git_url(config: DictConfig) -> None: + repo_url_or_path = get_argument(config, "git.repo_url_or_path") + if repo_url_or_path is not None: + if repo_url_or_path.startswith("git@"): + raise ValueError("Currently we do not support to clone repo use ssh, please use http with token instead") diff --git a/launcher/efa.py b/launcher/efa.py new file mode 100644 index 0000000..a234680 --- /dev/null +++ b/launcher/efa.py @@ -0,0 +1,147 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+
+efa_supported_instance = set(
+    [
+        "c5n.18xlarge",
+        "c5n.9xlarge",
+        "c5n.metal",
+        "c6a.48xlarge",
+        "c6a.metal",
+        "c6gn.16xlarge",
+        "c6i.32xlarge",
+        "c6i.metal",
+        "c6id.32xlarge",
+        "c6id.metal",
+        "c6in.32xlarge",
+        "c6in.metal",
+        "c7a.48xlarge",
+        "c7a.metal-48xl",
+        "c7g.16xlarge",
+        "c7g.metal",
+        "c7gd.16xlarge",
+        "c7gd.metal",
+        "c7gn.16xlarge",
+        "c7gn.metal",
+        "c7i.48xlarge",
+        "c7i.metal-48xl",
+        "dl1.24xlarge",
+        "dl2q.24xlarge",
+        "g4dn.12xlarge",
+        "g4dn.16xlarge",
+        "g4dn.8xlarge",
+        "g4dn.metal",
+        "g5.12xlarge",
+        "g5.16xlarge",
+        "g5.24xlarge",
+        "g5.48xlarge",
+        "g5.8xlarge",
+        "g6.12xlarge",
+        "g6.16xlarge",
+        "g6.24xlarge",
+        "g6.48xlarge",
+        "g6.8xlarge",
+        "gr6.8xlarge",
+        "i3en.12xlarge",
+        "i3en.24xlarge",
+        "i3en.metal",
+        "i4g.16xlarge",
+        "i4i.32xlarge",
+        "i4i.metal",
+        "im4gn.16xlarge",
+        "inf1.24xlarge",
+        "m5dn.24xlarge",
+        "m5dn.metal",
+        "m5n.24xlarge",
+        "m5n.metal",
+        "m5zn.12xlarge",
+        "m5zn.metal",
+        "m6a.48xlarge",
+        "m6a.metal",
+        "m6i.32xlarge",
+        "m6i.metal",
+        "m6id.32xlarge",
+        "m6id.metal",
+        "m6idn.32xlarge",
+        "m6idn.metal",
+        "m6in.32xlarge",
+        "m6in.metal",
+        "m7a.48xlarge",
+        "m7a.metal-48xl",
+        "m7g.16xlarge",
+        "m7g.metal",
+        "m7gd.16xlarge",
+        "m7gd.metal",
+        "m7i.48xlarge",
+        "m7i.metal-48xl",
+        "p3dn.24xlarge",
+        "p4d.24xlarge",
+        "p4de.24xlarge",
+        "p5.48xlarge",
+        "r5dn.24xlarge",
+        "r5dn.metal",
+        "r5n.24xlarge",
+        "r5n.metal",
+        "r6a.48xlarge",
+        "r6a.metal",
+        "r6i.32xlarge",
+        "r6i.metal",
+        "r6id.32xlarge",
+        "r6id.metal",
+        "r6idn.32xlarge",
+        "r6idn.metal",
+        "r6in.32xlarge",
+        "r6in.metal",
+        "r7a.48xlarge",
+        "r7a.metal-48xl",
+        "r7g.16xlarge",
+        "r7g.metal",
+        "r7gd.16xlarge",
+        "r7gd.metal",
+        "r7i.48xlarge",
+        "r7i.metal-48xl",
+        "r7iz.32xlarge",
+        "r7iz.metal-32xl",
+        "r8g.24xlarge",
+        "r8g.48xlarge",
+        "r8g.metal-24xl",
+        "r8g.metal-48xl",
+        "trn1.32xlarge",
+        "trn1n.32xlarge",
+        "u7i-12tb.224xlarge",
+        "u7in-16tb.224xlarge",
+        "u7in-24tb.224xlarge",
+        "u7in-32tb.224xlarge",
+        "vt1.24xlarge",
+        "x2idn.32xlarge",
+        "x2idn.metal",
+        "x2iedn.32xlarge",
+        "x2iedn.metal",
+        "x2iezn.12xlarge",
+        "x2iezn.metal",
+    ]
+)
+
+instanceWithMultipleEFAs = {
+    "p4d.24xlarge": 4,
+    "p4de.24xlarge": 4,
+    "p5.4xlarge": 4,
+    "p5.24xlarge": 16,
+    "p5.48xlarge": 32,
+    "trn1.32xlarge": 8,
+    "trn1n.32xlarge": 16,
+}
+
+instanceWithRDMASupport = set(
+    ["p4d.24xlarge", "p4de.24xlarge", "p5.4xlarge", "p5.24xlarge", "p5.48xlarge", "trn1.32xlarge", "trn1n.32xlarge"]
+)
diff --git a/launcher/nemo/README.md b/launcher/nemo/README.md
new file mode 100644
index 0000000..5370d06
--- /dev/null
+++ b/launcher/nemo/README.md
@@ -0,0 +1,16 @@
+# Core NeMo launching implementations
+This folder contains the core launching framework for NeMo-based implementations. We use the same design as the [NeMo-Framework-Launcher](https://github.com/NVIDIA/NeMo-Framework-Launcher/tree/main). Basically there are 2 steps:
+- A stage defined in `stages.py` prepares the training script launch command and the cluster configs, and passes these configs to the actual launcher
+- A launcher defined in `launchers.py` takes the configs from the stage and generates the real launch script. The launcher then kicks off the run using the corresponding cluster method, i.e. Slurm or k8s (see the sketch below).
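+
+For reference, step 2's choice of launcher comes from the cluster type in the config. The snippet below is a minimal sketch of that lookup using `SMAutoLauncher.get_launchers()` from `launchers.py` (added later in this change); it assumes the package and its NeMo launcher dependency are installed and is for illustration only.
+
+```python
+# Minimal sketch: map the configured cluster type to the launcher class that
+# generates and submits the launch script. The mapping mirrors
+# SMAutoLauncher.get_launchers() in launchers.py.
+from launcher.nemo.launchers import SMAutoLauncher
+
+launchers = SMAutoLauncher.get_launchers()
+print(launchers["bcm"])      # SMSlurmLauncher -> Slurm clusters
+print(launchers["k8s"])      # SMK8SLauncher   -> Kubernetes (EKS) clusters
+print(launchers["sm_jobs"])  # SMJobsLauncher  -> SageMaker training jobs
+```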
+
+## Stages
+We support different use cases, and each corresponds to a stage:
+- `SMTraining`: Stage to run native NeMo workloads
+- `SMTrainingGPURecipe`: Stage used to run our GPU recipes
+- `SMTrainingTrainiumRecipe`: Stage to run our Trainium recipes
+- `SMCustomTrainingGPU`: Stage for training with a custom script on GPU
+- `SMCustomTrainingTrainium`: Stage for training with a custom script on Trainium
+
+## Launchers
+Currently we only need our own launchers for custom jobs, because we need to manage the `torchrun` command:
+- `SMSlurmLauncher`: Launcher for custom jobs using Slurm
diff --git a/launcher/nemo/__init__.py b/launcher/nemo/__init__.py
new file mode 100644
index 0000000..6549052
--- /dev/null
+++ b/launcher/nemo/__init__.py
@@ -0,0 +1,12 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
diff --git a/launcher/nemo/constants.py b/launcher/nemo/constants.py
new file mode 100644
index 0000000..3b881fb
--- /dev/null
+++ b/launcher/nemo/constants.py
@@ -0,0 +1,31 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+ +from pathlib import Path + +SM_ADAPTER_REPO = "https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git" +NEMO_REPO = "https://github.com/NVIDIA/NeMo.git" +NEMO_REPO_TAG = "v2.0.0rc0" # [TODO] move to v2.0.0 once it is released + +SM_ADAPTER_MODEL_TYPE_TO_CODE_PATH = { + "llama": "examples/llama/llama_pretrain.py", + "mistral": "examples/mistral/mistral_pretrain.py", + "mixtral": "examples/mixtral/mixtral_pretrain.py", +} + +NEURONX_REPO_URI = "https://github.com/aws-neuron/neuronx-distributed-training.git" +NEURONX_REPO_TAG = "main" +NEURONX_CONF_PATH = "examples/conf" + +# utility directory to more easily navigate to other parts of the package +ROOT_DIR = Path(__file__).resolve().parent.parent.parent # package root diff --git a/launcher/nemo/k8s_templates/training/Chart.yaml b/launcher/nemo/k8s_templates/training/Chart.yaml new file mode 100644 index 0000000..5665246 --- /dev/null +++ b/launcher/nemo/k8s_templates/training/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: Sagemaker Model Training +name: sagemaker-training +version: 1.0.0 diff --git a/launcher/nemo/k8s_templates/training/train-script-gpu.yaml b/launcher/nemo/k8s_templates/training/train-script-gpu.yaml new file mode 100644 index 0000000..1cdc73e --- /dev/null +++ b/launcher/nemo/k8s_templates/training/train-script-gpu.yaml @@ -0,0 +1,54 @@ +{{ $config := .Values.trainingConfig }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: train-script-gpu-{{ $config.jobName }} +data: + train-script.sh: | + #!/bin/bash + set -ex + + {{- if $config.git.repo_url_or_path }} + mkdir -p $HOME/tmp + GIT_CLONE_DIR=$HOME/tmp/$HOSTNAME + [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR + git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR + GIT_CLONE_DIR=${GIT_CLONE_DIR}/ + cd $GIT_CLONE_DIR + + {{- if $config.git.branch }} + git checkout {{ $config.git.branch }} + {{- end }} + + {{- if $config.git.commit }} + git fetch origin {{ $config.git.commit }} + git reset --hard {{ $config.git.commit }} + {{- end }} + {{- else }} + GIT_CLONE_DIR="" + {{- end }} + + {{- range $config.pre_script }} + {{ . }} + {{- end }} + + {{- if gt (int $config.nodes) 1 }} + export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --rdzv_backend=c10d --rdzv_endpoint={{ $config.jobName }}-worker-0" + {{- else }} + export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}" + {{- end }} + + echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" + torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }} \ + {{- if $config.scriptArgs -}} + {{ $config.scriptArgs }} + {{- end }} + + {{- range $config.post_script }} + {{ . 
}} + {{- end }} + + {{- if $config.git.repo_url_or_path }} + cd $HOME + rm -rf $GIT_CLONE_DIR + {{- end }} diff --git a/launcher/nemo/k8s_templates/training/train-script-trn.yaml b/launcher/nemo/k8s_templates/training/train-script-trn.yaml new file mode 100644 index 0000000..d07f451 --- /dev/null +++ b/launcher/nemo/k8s_templates/training/train-script-trn.yaml @@ -0,0 +1,111 @@ +{{ $config := .Values.trainingConfig }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: train-script-trn-{{ $config.jobName }} +data: + train-script.sh: | + #!/usr/bin/env bash + + set -o pipefail + set -ex + + {{- if $config.git.repo_url_or_path }} + mkdir -p $HOME/tmp + GIT_CLONE_DIR=$HOME/tmp/$HOSTNAME + [[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR + git clone {{ $config.git.repo_url_or_path }} $GIT_CLONE_DIR + GIT_CLONE_DIR=${GIT_CLONE_DIR}/ + cd $GIT_CLONE_DIR + + {{- if $config.git.branch }} + git checkout {{ $config.git.branch }} + {{- end }} + + {{- if $config.git.commit }} + git fetch origin {{ $config.git.commit }} + git reset --hard {{ $config.git.commit }} + {{- end }} + {{- else }} + GIT_CLONE_DIR="" + {{- end }} + + {{- range $config.pre_script }} + {{ . }} + {{- end }} + + {{- if gt (int $config.nodes) 1 }} + hostname=$(hostname) + prefix="{{ $config.jobName }}-worker-" + echo "prefix is $prefix" + node_id=${hostname#"$prefix"} + export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }} --nnodes {{ $config.nodes }} --node_rank $node_id --master_addr={{ $config.jobName }}-worker-0 --master_port 41000" + {{- else }} + export DISTRIBUTED_ARGS="--nproc_per_node {{ $config.ntasksPerNode }}" + {{- end }} + + {{- if $config.customScript }} + # Custom script provided + echo "DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" + torchrun $DISTRIBUTED_ARGS ${GIT_CLONE_DIR}{{ $config.scriptPath }} + {{- if $config.scriptArgs -}} \ + {{ $config.scriptArgs }} + {{- end }} + {{- else }} + # Recipe provided + # Implementation from NeuronxDistributedTraining's train_setup.sh (https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train_setup.sh) and train.sh (https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train.sh) + ulimit -n 65535 + + # removed the `sudo` commands in this block since it leads to permission errors + sysctl -w net.ipv4.ip_local_reserved_ports=41000 + if which lctl >/dev/null 2>&1; then + lctl set_param 'osc.*.max_dirty_mb=64' # Cap max space each connection to FSx reserves so we avoid OODs + fi + + export FI_EFA_USE_DEVICE_RDMA=1 + export FI_PROVIDER=efa + export FI_EFA_FORK_SAFE=1 + export XLA_DISABLE_FUNCTIONALIZATION=0 + export HYDRA_FULL_ERROR=1 + export MALLOC_ARENA_MAX=128 + export CREATE_TB_LOGGER=True + export CHECKPOINT_CALLBACK=True + + # Place cache on shared storage to reduce redundant compilations + export NEURON_COMPILE_CACHE_URL="/{{ (index $config.persistentVolumeClaims 0).mountPath }}/neuron_cache" + mkdir -p $NEURON_COMPILE_CACHE_URL + + {{- if eq (int $config.compile) 1 }} + MAYBE_COMPILE="neuron_parallel_compile" + {{- end }} + + echo "env MAYBE_COMPILE=$MAYBE_COMPILE" + echo "env DISTRIBUTED_ARGS=$DISTRIBUTED_ARGS" + # End of block + {{- if $config.git.repo_url_or_path }} + # copy the training conf generated by the Launcher into Neuron's repo + cp -f /config/config.yaml examples/conf/launcher_config.yaml + + # cd into directory containing training_orchestrator.py and /conf + cd examples + + {{- end }} + $MAYBE_COMPILE torchrun $DISTRIBUTED_ARGS training_orchestrator.py \ + --config-path=conf \ + 
--config-name=launcher_config \ + trainer.devices={{ $config.ntasksPerNode | default 32 }} \ + trainer.num_nodes={{ $config.nodes }} + + # return to top-level directory + cd .. + {{- end }} + + {{- range $config.post_script }} + {{ . }} + {{- end }} + + {{- if $config.git.repo_url_or_path }} + cd $HOME + rm -rf $GIT_CLONE_DIR + rm -rf $NEURON_COMPILE_CACHE_URL + {{- end }} diff --git a/launcher/nemo/k8s_templates/training/training-config.yaml b/launcher/nemo/k8s_templates/training/training-config.yaml new file mode 100644 index 0000000..64e7924 --- /dev/null +++ b/launcher/nemo/k8s_templates/training/training-config.yaml @@ -0,0 +1,8 @@ +{{ $config := .Values.trainingConfig }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: training-config-{{ $config.jobName }} +data: + config.yaml: |- + {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} diff --git a/launcher/nemo/k8s_templates/training/training.yaml b/launcher/nemo/k8s_templates/training/training.yaml new file mode 100644 index 0000000..b40d89c --- /dev/null +++ b/launcher/nemo/k8s_templates/training/training.yaml @@ -0,0 +1,177 @@ +{{ $config := .Values.trainingConfig }} +apiVersion: kubeflow.org/v1 +kind: PyTorchJob +metadata: + name: {{ $config.jobName }} + namespace: {{ $config.namespace }} + {{- if $config.annotations }} + annotations: + {{- range $key, $value := $config.annotations }} + {{ $key | quote }}: {{ $value | quote }} + {{- end }} + {{- end }} + labels: + app: {{ $config.jobName }} + {{- if $config.customLabels }} + {{- range $key, $value := $config.customLabels }} + {{ $key | quote }}: {{ $value | quote }} + {{- end}} + {{- end }} +spec: + pytorchReplicaSpecs: + Worker: + replicas: {{ $config.nodes }} + template: + {{- if $config.customLabels }} + metadata: + labels: + {{- range $key, $value := $config.customLabels }} + {{ $key | quote }}: {{ $value | quote }} + {{- end }} + {{- end }} + spec: + {{- if $config.priorityClassName }} + priorityClassName: {{ $config.priorityClassName }} + {{- end}} + {{- if $config.serviceAccountName }} + serviceAccountName: {{ $config.serviceAccountName }} + {{- end }} + containers: + - name: pytorch + image: {{ .Values.image.trainingImage }} + env: + {{- range $key, $value := $config.envVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end}} + command: + - /etc/config/train-script.sh + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + {{- if or (eq $config.device "gpu") (eq $config.device "trainium") (gt (int $config.numEFADevices) 0 ) }} + resources: + requests: + {{- if eq $config.device "gpu" }} + nvidia.com/gpu: {{ $config.ntasksPerNode }} + {{- end }} + {{- if eq $config.device "trainium" }} + aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }} + {{- end }} + {{- if gt (int $config.numEFADevices) 0 }} + vpc.amazonaws.com/efa: {{ $config.numEFADevices }} + {{- end }} + limits: + {{- if eq $config.device "gpu" }} + nvidia.com/gpu: {{ $config.ntasksPerNode }} + {{- end }} + {{- if eq $config.device "trainium" }} + aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }} + {{- end }} + {{- if gt (int $config.numEFADevices) 0 }} + vpc.amazonaws.com/efa: {{ $config.numEFADevices }} + {{- end }} + {{- end }} + volumeMounts: + {{- if $config.persistentVolumeClaims }} + {{- range $config.persistentVolumeClaims }} + - mountPath: {{ .mountPath }} + name: {{ .claimName }}-volume + {{- end }} + {{- end }} + {{- if $config.volumes }} + {{- range $config.volumes }} + - name: {{ .volumeName }} + mountPath: 
{{ .mountPath }} + {{- end }} + {{- end }} + {{- if not $config.customScript }} + - mountPath: /config + name: training-config + {{- end }} + - mountPath: /etc/config + name: train-script + - mountPath: /dev/shm + name: shm + - mountPath: /var/log/aws/clusters + name: aws-clusters-logs + readOnly: true + restartPolicy: {{ $config.restartPolicy }} + + {{- if (or $config.labelSelector.required $config.labelSelector.preferred) }} + affinity: + nodeAffinity: + {{- if $config.labelSelector.required }} + {{- range $key, $values := $config.labelSelector.required }} + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ $key | quote }} + operator: In + values: + {{- range $values }} + - {{ . | quote }} + {{- end}} + {{- end }} + {{- end }} + + {{- if $config.labelSelector.preferred }} + {{- $index := 0 }} + {{- range $key, $values := $config.labelSelector.preferred }} + preferredDuringSchedulingIgnoredDuringExecution: + - weight: {{ index $config.labelSelector.weights $index }} + preference: + matchExpressions: + - key: {{ $key | quote }} + operator: In + values: + {{- range $values }} + - {{ . | quote }} + {{- end }} + {{- $index = add $index 1 }} + {{- end }} + {{- end }} + {{- end }} + + volumes: + {{- if $config.persistentVolumeClaims }} + {{- range $config.persistentVolumeClaims }} + - name: {{ .claimName }}-volume + persistentVolumeClaim: + claimName: {{ .claimName }} + {{- end }} + {{- end }} + {{- if $config.volumes }} + {{- range $config.volumes }} + - name: {{ .volumeName }} + hostPath: + path: {{ .hostPath }} + type: Directory + {{- end }} + {{- end }} + {{- if not $config.customScript }} + - configMap: + name: training-config-{{ $config.jobName }} + name: training-config + {{- end }} + - name: shm + hostPath: + path: /dev/shm + type: Directory + - name: aws-clusters-logs + hostPath: + path: /var/log/aws/clusters + type: DirectoryOrCreate + - name: train-script + configMap: + defaultMode: 420 + items: + - key: train-script.sh + mode: 365 + path: train-script.sh + {{- if eq $config.device "trainium" }} + name: train-script-trn-{{ $config.jobName }} + {{- else }} + name: train-script-gpu-{{ $config.jobName }} + {{- end }} diff --git a/launcher/nemo/k8s_templates/training/values.yaml b/launcher/nemo/k8s_templates/training/values.yaml new file mode 100644 index 0000000..81282c9 --- /dev/null +++ b/launcher/nemo/k8s_templates/training/values.yaml @@ -0,0 +1,83 @@ +image: + # training image + trainingImage: cfg.container + + # image pulling policy + pullPolicy: IfNotPresent + + +trainingConfig: + # current job name + jobName: "nil" + + # namespace to launch job + namespace: "default" + + # script path + scriptPath: null + + # script args + scriptArgs: null + + # specify whether to use custom scripts + customScript: null + + # list of custom annotations apply to jobs + annotations: null + + # list of custom labels apply to jobs and pods + customLabels: null + + # Kueue scheduler priority class name + priority_class_name: null + + # device type, can be "gpu", "trainium" and "nil", "nil" means cpu + device: "nil" + + # number of EFA devices if the instance type support EFA + numEFADevices: 0 + + # number of Neuron devices if job is for Trainium + numNeuronDevices: null + + # number of process per node + ntasksPerNode: 0 + + # number of nodes to run + nodes: training.trainer.num_nodes + + # restart policy + restartPolicy: Never + + # from NeMo, not used currently + wandbKey: "nil" + + # name of service account associated with the namespace + 
serviceAccountName: null + + # relevant for Trainium chips, either 0 or 1 + compile: 0 + + # persistent volume, usually used to mount FSx + persistentVolumeClaims: null + + # temp volume, usually used to mount temp file in the host + volumes: null + + # A github repo if user might want to use script inside + git: + repo_url_or_path: null + branch: null + commit: null + token: null + + # Commands to run before training + pre_script: [] + # Commands to run after training + post_script: [] + + # select preferred and required labels for nodes + labelSelector: + required: null # select nodes with required labels + preferred: null # select nodes with priority which has preferred labels + weights: null # list of weights for the preferred labels diff --git a/launcher/nemo/launchers.py b/launcher/nemo/launchers.py new file mode 100755 index 0000000..d48321a --- /dev/null +++ b/launcher/nemo/launchers.py @@ -0,0 +1,92 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +from pathlib import Path +from typing import List + +import nemo_launcher.utils.job_utils as job_utils +from nemo_launcher.core.launchers import AutoLauncher, K8SLauncher, Launcher + +from .slurm_launcher import SMSlurmLauncher + + +class SMAutoLauncher(AutoLauncher): + """ + AutoLauncher object for Sagemaker + """ + + @staticmethod + def get_launchers(): + """Returns supported launchers as a dictionary from launcher name to launcher class""" + return { + "bcm": SMSlurmLauncher, + "k8s": SMK8SLauncher, + "sm_jobs": SMJobsLauncher, + } + + +class SMK8SLauncher(K8SLauncher): + """ + Launcher for SM training jobs using K8s. + """ + + def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: + """ + Generate the script to launch the Helm chart. + A very simple bash script is generated which runs `helm install` for the + Helm chart that was generated. + + :param List[List[str]] command_groups: Command groups to launch with + :return: submission script file's text + :rtype: str + """ + paths = job_utils.JobPaths(folder=self.folder, job_name=self.job_name) + helm_charts = paths.folder / "k8s_template" + job_name = self.job_name.replace("_", "-") + + extra_helm_args = "" + if self.parameters.get("namespace", None): + extra_helm_args += f" --namespace {self.parameters['namespace']}" + + # Apply a timeout of 15min in case images take a long time to bring up + # or pre-install hooks take a while + return f"#!/bin/bash\nhelm install --timeout=15m --wait {extra_helm_args} {job_name} {helm_charts}\n" + + +class SMJobsLauncher(Launcher): + def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: + """ + Given the command groups, generate submission script file's text. + Command groups is a list of command group. A command group is defined as: + 0. Command group is a list of command strings + 1. Each command group occupies one bcprun, srun or bash + 2. 
Each command group eventually has multiple commands connected by ";" + On interactive cluster, multi-gpu python scripts are launched with `torchrun --nproc_per_node=??` + + :param List[List[str]] command_groups: Command groups to launch with + :return: submission script file's text + :rtype: str + """ + # now create + lines = ["#!/bin/bash", ""] + + for group_ind, command_group in enumerate(command_groups): + command = "\n".join(command_group) + lines.append(command) + return "\n".join(lines) + + def _submit_command(self, submission_file_path: Path) -> str: + command_list = ["bash", submission_file_path] + # run + job_utils.CommandFunction(command_list, ret_stdout=False, verbose=False)() # explicit errors + return "" diff --git a/launcher/nemo/nemo_framework_launcher b/launcher/nemo/nemo_framework_launcher new file mode 160000 index 0000000..3d41c31 --- /dev/null +++ b/launcher/nemo/nemo_framework_launcher @@ -0,0 +1 @@ +Subproject commit 3d41c31c91d5a47a84ad15cbf783c56700c30521 diff --git a/launcher/nemo/recipe_stages.py b/launcher/nemo/recipe_stages.py new file mode 100755 index 0000000..5ff4e85 --- /dev/null +++ b/launcher/nemo/recipe_stages.py @@ -0,0 +1,161 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +from pathlib import Path +from typing import Dict, List + +from omegaconf import OmegaConf + +from ..accelerator_devices import get_num_accelerator_devices +from .constants import ( + NEMO_REPO, + NEMO_REPO_TAG, + NEURONX_CONF_PATH, + NEURONX_REPO_TAG, + NEURONX_REPO_URI, + ROOT_DIR, + SM_ADAPTER_MODEL_TYPE_TO_CODE_PATH, + SM_ADAPTER_REPO, +) +from .stages import SMTraining, get_num_nodes, set_multinode_envs + + +class SMTrainingGPURecipe(SMTraining): + """ + Stage used to run our GPU recipes + """ + + @property + def _default_repo(self): + return SM_ADAPTER_REPO + + @property + def _entry_script_path(self) -> Path: + # [TODO] Handle generate the script path from github + choice_model_type, _ = self.get_stage_config_choice() + choice_model_type = choice_model_type.split("/")[1] + # predefined model + if choice_model_type in SM_ADAPTER_MODEL_TYPE_TO_CODE_PATH: + return Path(SM_ADAPTER_MODEL_TYPE_TO_CODE_PATH[choice_model_type]) + # custom model + return Path("examples/custom_model/custom_pretrain.py") + + def get_stage_config_choice(self): + # [TODO] check if need to override + return super().get_stage_config_choice() + + +class NeMoTraining(SMTraining): + """ + Stage to run NeMo recipes + """ + + @property + def _nemo_code_path(self) -> Path: + return Path("") + + @property + def _default_repo(self): + return NEMO_REPO + + @property + def _default_branch(self): + return NEMO_REPO_TAG + + @property + def _entry_script_path(self) -> Path: + choice_model_type, _ = self.get_stage_config_choice() + choice_model_type = choice_model_type.split("/")[1] + code_path = self._get_nemo_code_path(choice_model_type) + return Path(code_path) + + +class SMTrainingTrainiumRecipe(SMTraining): + """ + Stage to run our Trainium recipes + """ + + DEFAULT_TRAIN_SCRIPT_PATH = 
"examples/train.sh" + + def __init__(self, cfg): + super().__init__(cfg) + self.device = "trainium" + + # Used by Slurm and K8s. Example: "llama/megatron_llama_7B_config" + self._training_filename = self.cfg.training_config.rsplit("/", 1)[-1] + self._temp_training_conf_file = ROOT_DIR / f"tmp/training/{self._training_filename}.yaml" + + if not self._temp_training_conf_file.parent.exists(): + self._temp_training_conf_file.parent.mkdir(parents=True) + + @property + def _default_repo(self): + return NEURONX_REPO_URI + + @property + def _default_branch(self): + return NEURONX_REPO_TAG + + @property + def _entry_script_path(self) -> Path: + cfg_git_entry_script = self.cfg.get("git", {}).get("entry_script") + entry_script_path = cfg_git_entry_script or self.DEFAULT_TRAIN_SCRIPT_PATH + return Path(entry_script_path) + + def _make_custom_call_string(self, stage_cfg_path=None): + """ + Create the command that runs the training script + """ + compile = OmegaConf.select(self.cfg, "recipes.run.compile", default=0) + + commands: List[str] = [ + "# copy the resolved training config file into the cloned Neuronx repo", + f"cp -f {self._temp_training_conf_file} {NEURONX_CONF_PATH}", + "", + "# training script depends on other files invoked with relative paths, so must cd into it", + f'cd "$(dirname {self._entry_script_path})"', + "", + "# run training script but first define its arguments", + f"export CONF_FILE={self._training_filename}", + f"export COMPILE={compile}", + f'bash ./"$(basename {self._entry_script_path})"', + "", + ] + return "\n".join(commands) + + def update_stage_specific_k8s_values(self, values_template): + """ + training specifc k8s values for trainum + """ + super().update_stage_specific_k8s_values(values_template) + values_template.trainingConfig.numNeuronDevices = get_num_accelerator_devices(self.instance_type) + return values_template + + def get_env_vars(self) -> Dict: + """ + Set up dictionary for environment variables + By default injecting the EFA env variable when doing multi-node training + The environment variables from hydra config will be set inside the job scripts. + For Example: + Set `env_vars.NVTE_BIAS_DROPOUT_FUSION=1` while calling nemo_launcherlauncher-scripts, + `NVTE_BIAS_DROPOUT_FUSION=1` will be set while running the job. + + :return: a dictionary of env vars while running the job. + :rtype: Dict + """ + env_vars = super().get_env_vars() + stage_cfg = self.stage_cfg + nodes = get_num_nodes(stage_cfg) + if int(nodes) > 1: + env_vars = set_multinode_envs(env_vars, self.instance_type) + return env_vars diff --git a/launcher/nemo/slurm_launcher.py b/launcher/nemo/slurm_launcher.py new file mode 100644 index 0000000..ee31cda --- /dev/null +++ b/launcher/nemo/slurm_launcher.py @@ -0,0 +1,147 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+ +import os +import shutil +from pathlib import Path +from typing import Any, List, Union + +import nemo_launcher.utils.job_utils as job_utils +from nemo_launcher.core.launchers import SlurmLauncher +from nemo_launcher.core.logger import logger + +NEMO_LAUNCHER_DEBUG = os.getenv("NEMO_LAUNCHER_DEBUG", "False").lower() in ( + "true", + "t", + "1", +) + + +class SMJobPaths(job_utils.JobPaths): + """ + Our launcher contains an extra entry script called train_script.sh + This class is used to specify its path + """ + + @property + def train_script_file(self) -> Path: + return self._folder / f"train_script.sh" + + @property + def launch_docker_container_file(self) -> Path: + return self._folder / f"launch_docker_container.sh" + + @property + def docker_exec_script_file(self) -> Path: + return self._folder / f"docker_exec_script.sh" + + +class SMSlurmLauncher(SlurmLauncher): + """ + Launcher for SM training jobs using slurm. + This launcher will launch the job using `torchrun`, unlike the NeMo slurm launcher which use Pytorch lightning + to handle the torch.distributed. This launcher will create a separate train_script.sh with proper `torchrun` distributed arg prepared. + Checking `_make_train_script_text` function in stage.py for more details. + """ + + def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None: + # We need to handle this ntasks_per_node specifically + # Since we are using torchrun to launch custom jobs, we can not use ntasks_per_node in sbatch command + self.ntasks_per_node = kwargs.pop("ntasks_per_node", 8) + if "train_script_text" in kwargs: + self.train_script_text = kwargs.pop("train_script_text") + else: + raise ValueError(f"Missing train_script_text from launcher kwargs {kwargs}") + self.launch_docker_container_text = kwargs.pop("launch_docker_container_text", None) + self.docker_exec_script_text = kwargs.pop("docker_exec_script_text", None) + self.slurm_create_submission_file_only = kwargs.pop("slurm_create_submission_file_only", False) + if "hostfile" in kwargs: + self.hostfile = kwargs.pop("hostfile") + else: + raise ValueError(f"Missing hostfile from launcher kwargs {kwargs}") + if "slurm_docker_cfg" in kwargs: + kwargs.pop("slurm_docker_cfg") + super(SlurmLauncher, self).__init__(folder, job_name) + self.parameters = {} + self._update_parameters(job_name=job_name, **kwargs) + if shutil.which("srun") is None and not NEMO_LAUNCHER_DEBUG and not self.slurm_create_submission_file_only: + raise RuntimeError('Could not detect "srun", are you indeed on a slurm cluster?') + + def _make_train_script_file(self): + """ + Create the custom train_script.sh + Optional create launch_docker_container.sh to launch docker container on every node + """ + job_paths = SMJobPaths(folder=self.folder, job_name=self.job_name) + folder = job_paths.folder + folder.mkdir(parents=True, exist_ok=True) + train_script_file_path = job_paths.train_script_file + with train_script_file_path.open("w") as f: + f.write(self.train_script_text) + if self.launch_docker_container_text is not None: + launch_docker_container_file = job_paths.launch_docker_container_file + with launch_docker_container_file.open("w") as f: + f.write(self.launch_docker_container_text) + if self.docker_exec_script_text is not None: + docker_exec_script_file = job_paths.docker_exec_script_file + with docker_exec_script_file.open("w") as f: + f.write(self.docker_exec_script_text) + + def launch(self, command_groups: List[List[str]]) -> str: + # Create the custom train_script.sh before launching the real job 
+ self._make_train_script_file() + + # Same as upstream, but exposing extra control for submission through slurm_create_submission_file_only + submission_file_path = self._make_submission_file(command_groups) + logger.info(f"Job {self.job_name} submission file created at '{submission_file_path}'") + job_id = "" + if not NEMO_LAUNCHER_DEBUG and not self.slurm_create_submission_file_only: + job_id = self._submit_command(submission_file_path) + if job_id: + logger.info(f"Job {self.job_name} submitted with Job ID {job_id}") + with open(self.folder / "launcher.log", "w") as f: + f.write(f"Submitted batch job {job_id}") + else: + logger.info(f"To submit your job on Slurm, run `sbatch {submission_file_path}`") + + return job_id + + def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: + """ + The submission file will be responsible for the following + - Handle sbatch config (implemented in upstream) + - Handle env variables (implemented in upstream) + - Handle storing distribution information which will be consumed by train_script.sh + - Call train_script.sh with proper srun command + """ + origin_sbatch_str = super()._make_submission_file_text(command_groups) + origin_sbatch_str = origin_sbatch_str.split("\n") + assert origin_sbatch_str[0] == "#!/bin/bash", origin_sbatch_str[0] + command_idx = None + for idx, sbatch_str in enumerate(origin_sbatch_str): + if sbatch_str.startswith("# command"): + command_idx = idx + break + assert command_idx is not None, f"Can not find command in the submission file str: {origin_sbatch_str}" + distributed_strs = [ + "", + "# Prepare distributed files", + f'srun -l bash -c "scontrol show hostnames | sort > {self.hostfile}"', + "", + ] + if self.launch_docker_container_text is None: + updated_sbatch_str = origin_sbatch_str[:command_idx] + distributed_strs + origin_sbatch_str[command_idx:] + else: + updated_sbatch_str = origin_sbatch_str[:command_idx] + distributed_strs + command_groups[0] + + return "\n".join(updated_sbatch_str) diff --git a/launcher/nemo/stages.py b/launcher/nemo/stages.py new file mode 100755 index 0000000..bccb8aa --- /dev/null +++ b/launcher/nemo/stages.py @@ -0,0 +1,935 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+# Portions taken from https://github.com/NVIDIA/NeMo-Framework-Launcher, Copyright Nvidia Corporation
+
+
+import logging
+import shutil
+from pathlib import Path
+from typing import Dict, List
+
+import omegaconf
+from nemo_launcher.core.stages import Training, _hydra_interpolation
+from nemo_launcher.utils.job_utils import JobPaths
+from omegaconf import OmegaConf
+
+from ..accelerator_devices import get_num_accelerator_devices
+from ..efa import (
+    efa_supported_instance,
+    instanceWithMultipleEFAs,
+    instanceWithRDMASupport,
+)
+from ..telemetry import Telemetry
+from .constants import ROOT_DIR
+from .launchers import SMAutoLauncher
+
+logger = logging.getLogger(__name__)
+
+# Predefined distributed args for torchrun
+PROCESSES_PER_NODE = "PROCESSES_PER_NODE"
+NNODES = "NNODES"
+NODEID = "NODEID"
+MASTER_ADDR = "MASTER_ADDR"
+MASTER_PORT = "MASTER_PORT"
+DISTRIBUTED_ARGS = "DISTRIBUTED_ARGS"
+CONTAINER_NAME = "sm_training_launcher"
+TRANSFORMERS_VERSION_FOR_MULTIMODAL = "4.45.2"
+
+
+def set_multinode_envs(env_vars, instance_type):
+    # https://github.com/aws/aws-ofi-nccl/blob/master/doc/efa-env-var.md
+    if get_num_efa_devices(instance_type) > 0:
+        env_vars["FI_PROVIDER"] = "efa"
+    env_vars["NCCL_SOCKET_IFNAME"] = "^lo,docker0"
+    env_vars["NCCL_IGNORE_DISABLED_P2P"] = "1"
+    env_vars["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
+    env_vars["TORCH_DIST_INIT_BARRIER"] = "1"
+    env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+    return env_vars
+
+
+def allow_rdma(instance_type):
+    return instance_type in instanceWithRDMASupport
+
+
+def get_instance_type(cfg):
+    instance_type = None
+
+    if cfg.get("instance_type"):
+        instance_type = cfg.instance_type
+    else:
+        # custom path
+        instance_type = cfg.cluster.instance_type
+
+    assert instance_type is not None, "instance type is required from config"
+
+    if instance_type.startswith("ml."):
+        instance_type = instance_type[3:]
+
+    return instance_type.lower()
+
+
+def get_num_efa_devices(instance_type):
+    # If not an EFA instance, return 0
+    if instance_type not in efa_supported_instance:
+        return 0
+    # If multi-EFA, return from mapping
+    if instance_type in instanceWithMultipleEFAs:
+        return instanceWithMultipleEFAs[instance_type]
+    # Only a single EFA device
+    return 1
+
+
+def get_ntasks_per_node(stage_cfg):
+    """
+    Get the number of processes per node used for training.
+    When running with a custom script it will be stage_cfg.run.ntasks_per_node
+    """
+    ntasks = OmegaConf.select(stage_cfg, "run.ntasks_per_node")
+    if ntasks is None:
+        ntasks = stage_cfg.get("trainer").get("devices")
+    return ntasks
+
+
+def get_num_nodes(stage_cfg):
+    """
+    Get the number of nodes used for training.
+    When running with a custom script it will be stage_cfg.run.nodes
+    """
+    run_cfg = stage_cfg.get("run")
+    nodes = run_cfg.get("nodes")
+    if nodes is None:
+        nodes = stage_cfg.get("trainer").get("num_nodes")
+    return nodes
+
+
+def get_container_type(container):
+    if container is None:
+        return None
+    if container.endswith(".sqsh"):
+        return "enroot"
+    return "docker"
+
+
+def convert_dict_to_command_line_args(key_values):
+    # Render a dict of overrides as space-separated "key=value" command line args
+    command = ""
+    for key, value in key_values.items():
+        command += f"{key}={value} "
+    return command
+
+
+class SMTraining(Training):
+    """
+    Base stage class for doing training on SageMaker
+    """
+
+    def __init__(self, cfg):
+        super().__init__(cfg)
+        # Use GPU device for default flow for NeMo runs
+        self.device = "gpu"
+        self.instance_type = get_instance_type(cfg)
+        self.num_efa_devices = get_num_efa_devices(self.instance_type)
+        self.telemetry = Telemetry()
+
+    @property
+    def 
_default_repo(self): + # Default repo to mount script from + return None + + @property + def _default_branch(self): + # Default repo branch to mount script from + return None + + def _make_torchrun_string(self): + """ + Create torchrun string based on single/multi-node job + """ + ntasks_per_node = get_ntasks_per_node(self.stage_cfg) + if int(get_num_nodes(self.stage_cfg)) > 1: + return f"torchrun ${DISTRIBUTED_ARGS} " + else: + return f"torchrun --nproc_per_node {ntasks_per_node} " + + def _make_custom_call_string(self, stage_cfg_path=None) -> str: + """ + Create the training command with torchrun, script and args + """ + script_path = str(self._entry_script_path) + torchrun_cmd = self._make_torchrun_string() + script_args_str = self.get_script_args_str(stage_cfg_path) + command = [torchrun_cmd, script_path, script_args_str] + command_string = " \\\n ".join(command) + return command_string + + def _get_hostfile_location(self): + """ + Get the file location to store the hostnames + """ + job_path = self.get_job_path() + hostfile_location = Path(job_path.folder / "hostname") + return hostfile_location + + def _use_local_repo(self) -> bool: + repo_url_or_path = None + if OmegaConf.select(self.cfg, "git.repo_url_or_path"): + repo_url_or_path = self.cfg.git.repo_url_or_path + return repo_url_or_path is not None and not ( + repo_url_or_path.startswith("http") or repo_url_or_path.startswith("codecommit::") + ) + + def _make_docker_exec_script_text(self, stage_cfg_path): + docker_exec_script_text = ["#!/bin/bash", "set -ex"] + + docker_exec_script_text.append("") + docker_exec_script_text.append("function job_epilogue {") + docker_exec_script_text.append( + " docker ps -a --filter 'name=" + + CONTAINER_NAME + + "' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true" + ) + docker_exec_script_text.append("}") + docker_exec_script_text.append("trap job_epilogue EXIT SIGTERM SIGINT") + + docker_exec_script_text.append("") + docker_exec_script_text.append(f"docker exec {CONTAINER_NAME} bash {stage_cfg_path.parents[0]}/train_script.sh") + + docker_exec_script_text.append("") + docker_exec_script_text.append("exit 0") + + return "\n".join(docker_exec_script_text) + + def _make_launch_docker_container_text(self): + """ + Creating a script to launch container on all nodes + This will be called only when running docker container on Slurm cluster + """ + launch_docker_container_text = ["#!/bin/bash", "set -ex"] + image = self.cfg.container + + # Login ECR + launch_docker_container_text.append(f'echo "image is {image}"') + is_ecr_image = "amazonaws.com" in image + if not is_ecr_image: + launch_docker_container_text.append(f'echo "Not an ECR image, skipping ECR login"') + else: + # format will be account.dkr.ecr.region.amazonaws.com/repo:tag + link = image.split("/")[0] + region = link.split(".")[3] + launch_docker_container_text.append(f"# Login ECR") + launch_docker_container_text.append( + f"aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {link}" + ) + launch_docker_container_text.append("") + + # Handle EFA devices + if get_num_efa_devices(self.instance_type) > 0: + launch_docker_container_text.append(f"# Getting EFA devices") + if allow_rdma(self.instance_type): + launch_docker_container_text.append('device=("--device=/dev/gdrdrv")') + else: + launch_docker_container_text.append("device=()") + launch_docker_container_text.extend( + [ + "while IFS= read -r -d '' d; do", + ' device+=("--device=${d}")', + 'done < <(find "/dev/infiniband" -name 
"uverbs*" -print0)', + ] + ) + launch_docker_container_text.append("") + + # Clean old containers + launch_docker_container_text.append(f"# Clean old containers") + launch_docker_container_text.append( + "docker ps -a --filter 'name=" + + CONTAINER_NAME + + "' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true" + ) + launch_docker_container_text.append( + "docker ps -a --filter 'name=" + CONTAINER_NAME + "' --format '{{.ID}}' | xargs -I{} docker wait {} || true" + ) + launch_docker_container_text.append("") + + # Pull new container + launch_docker_container_text.append(f'docker pull "{image}"') + + # Docker run command + launch_docker_container_text.extend( + [ + f"docker run --gpus {get_ntasks_per_node(self.stage_cfg)} \\", + f' --privileged --rm -d --name "{CONTAINER_NAME}" \\', + " --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \\", + " --security-opt seccomp=unconfined \\", + ] + ) + + if get_num_efa_devices(self.instance_type) > 0: + launch_docker_container_text.append(' "${device[@]}" \\') + + # Handle volume mounting + mount_str = self._make_container_mounts_string() + for mount in mount_str.split(","): + launch_docker_container_text.append(f" -v {mount} \\") + + # Handle user run args and post run commands + post_launch_commands = [] + if OmegaConf.select(self.cfg, "cluster.slurm_docker_cfg", default=None) is not None: + if self.cfg.cluster.slurm_docker_cfg.get("docker_args", None) is not None: + user_arg = [] + for arg in self.cfg.cluster.slurm_docker_cfg.docker_args: + user_arg.append(arg) + if len(user_arg) > 0: + user_arg = " ".join(user_arg) + launch_docker_container_text.append(f" {user_arg} \\") + if self.cfg.cluster.slurm_docker_cfg.get("post_launch_commands", None) is not None: + for cmd in self.cfg.cluster.slurm_docker_cfg.post_launch_commands: + post_launch_commands.append(cmd) + if self.cfg.recipes.get("model", None) and self.cfg.recipes.model.get("multi_modal", False): + transformers_upgrade_cmd = "pip install transformers==4.45.2" + post_launch_commands.append(transformers_upgrade_cmd) + + launch_docker_container_text.append(f' "{image}" sleep infinity') + launch_docker_container_text.append("") + + # Allow containers to talk to each other + launch_docker_container_text.append(f"# Running post launching commands") + launch_docker_container_text.extend( + [ + f'docker exec -itd "{CONTAINER_NAME}" bash -c "printf \\"Port 2022\\n\\" >> /etc/ssh/sshd_config"', + f'docker exec -itd "{CONTAINER_NAME}" bash -c "printf \\" Port 2022\\n\\" >> /root/.ssh/config"', + f'docker exec -itd "{CONTAINER_NAME}" bash -c "service ssh start"', + ] + ) + for cmd in post_launch_commands: + launch_docker_container_text.append(f'docker exec "{CONTAINER_NAME}" bash -c "{cmd}"') + launch_docker_container_text.append("") + + # Exit + launch_docker_container_text.append("exit 0") + + return "\n".join(launch_docker_container_text) + + def _make_train_script_text(self, stage_cfg_path=None, port=41000) -> str: + """ + The custom train entry script, it will be responsible for following + - Handle resolving hostname and create torch distribtued args + - Pull from github if required + - Launch torchrun command + """ + nodes = get_num_nodes(self.stage_cfg) + ntasks_per_node = get_ntasks_per_node(self.stage_cfg) + script_text = ["#!/bin/bash", "set -ex"] + + # Also export env vars here so that they can be consumed by docker container + env_vars = self.get_env_vars() + if env_vars: + script_text.extend([f"export {k}={v}" for k, v in env_vars.items()]) + 
+ # Prepare for the host information to create the torchrun command + if nodes > 1: + script_text.extend( + [ + f"{MASTER_ADDR}=$(head -n 1 {str(self._get_hostfile_location())})", + f'{NODEID}=$(($(grep -nx -o "\\b$(hostname)\\b" {str(self._get_hostfile_location())} | cut -d ":" -f 1) - 1))', + f"{NNODES}={nodes}", + f"{PROCESSES_PER_NODE}={ntasks_per_node}", + f"{MASTER_PORT}={port}", + "", + ] + ) + if self.device == "trainium": + script_text.append( + f'{DISTRIBUTED_ARGS}="--nproc_per_node ${PROCESSES_PER_NODE} --nnodes ${NNODES} --node_rank ${NODEID} --master_addr ${MASTER_ADDR} --master_port ${MASTER_PORT}"' + ) + else: + script_text.append( + f'{DISTRIBUTED_ARGS}="--nproc_per_node ${PROCESSES_PER_NODE} --nnodes ${NNODES} --rdzv_endpoint=${MASTER_ADDR} --rdzv_id=100 --rdzv_backend=c10d"' + ) + else: + script_text.append(f'{DISTRIBUTED_ARGS}="--nproc_per_node {ntasks_per_node}"') + + # Prepare github pull + # Aligns with the train-script preparation in launcher/nemo/k8s_templates/training.yaml + script_text.append("") + if self.cfg.get("git", None) is not None or self._default_repo is not None: + repo_url_or_path = self._default_repo + branch = self._default_branch + if self.cfg.get("git", None) is not None: + if self.cfg.git.get("repo_url_or_path", None) is not None: + repo_url_or_path = str(self.cfg.git.get("repo_url_or_path")) + assert repo_url_or_path is not None, "`repo_url_or_path` must be defined when setting git config" + if self.cfg.git.get("token", None) is not None: + repo_url_or_path = self.insert_git_token(repo_url_or_path, self.cfg.git.token) + if self.cfg.git.get("branch", None) is not None: + branch = self.cfg.git.branch + + if not self._use_local_repo(): + # Remote repo, clone the repo url + script_text.extend( + [ + "# For greater env stability, grab hostname from `hostname`", + "# https://sim.amazon.com/issues/P162624109", + 'LAUNCHER_HOSTNAME="$(hostname)"', + "", + "mkdir -p $HOME/tmp", + 'GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME"', + "[[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR", + f"git clone {repo_url_or_path} $GIT_CLONE_DIR", + "GIT_CLONE_DIR=${GIT_CLONE_DIR}/", + "cd $GIT_CLONE_DIR", + ] + ) + else: + # simply cd to the directory for local repo + script_text.append(f"cd {repo_url_or_path}") + + if branch is not None: + script_text.append(f"git checkout {branch}") + if self.cfg.get("git", None) is not None and self.cfg.git.get("commit", None) is not None: + script_text.append(f"git fetch origin {self.cfg.git.commit}") + script_text.append(f"git reset --hard {self.cfg.git.commit}") + else: + script_text.append('GIT_CLONE_DIR=""') + + if not OmegaConf.select(self.cfg, "training.run.model_type", default="").startswith("neuron"): + script_text.append("") + script_text.append("unset SLURM_NTASKS") + + script_text.append("") + script_text.append(self._make_custom_call_string(stage_cfg_path)) + return "\n".join(script_text) + + @staticmethod + def save_stage_hydra_config(stage_cfg: OmegaConf, job_path: JobPaths, cfg: OmegaConf) -> Path: + """ + Overriding from Training.save_stage_hydra_config, remove the addition of extra keys in k8s case + Interpolate and save hydra config file for current stage + + :param OmegaConf stage_cfg: current stage's hydra configuration + :param JobPaths job_path: JobPaths object + :param OmegaConf cfg: base config for job + :return: path current stage's essential nemo scripts code + :rtype: Path + """ + + _hydra_interpolation(stage_cfg) + + cfg_save_path = job_path.config_file + omegaconf.OmegaConf.save(stage_cfg, cfg_save_path) + 
return cfg_save_path
+
+    def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]:
+        """
+        Custom run stage which will invoke the entry script only
+        [TODO] Make this compatible with the NeMo flow as well
+        """
+        if get_container_type(self.cfg.get("container", None)) == "docker":
+            logger.warning(
+                "[WARNING] You are using a docker container directly for the Slurm workload; we highly recommend using enroot instead"
+            )
+            command_groups = [
+                [
+                    # Launch the container first, then run the training script inside it
+                    f"srun -l bash {stage_cfg_path.parents[0]}/launch_docker_container.sh",
+                    f"srun -l bash {stage_cfg_path.parents[0]}/docker_exec_script.sh",
+                ]
+            ]
+        # There will be only a single command group
+        # enroot or conda/venv, no need to launch a docker container
+        else:
+            command_groups = [[f"bash {stage_cfg_path.parents[0]}/train_script.sh"]]
+
+        return command_groups
+
+    def create_sm_jobs_script(self, job_folder):
+        full_recipe_path = Path(job_folder) / "recipe.yaml"
+        OmegaConf.save(config=self.cfg.get("training"), f=full_recipe_path)
+        sm_jobs_config_path = Path(job_folder) / "sm_jobs_config.yaml"
+        OmegaConf.save(config=self.cfg.cluster.get("sm_jobs_config"), f=sm_jobs_config_path)
+        script_src = Path(ROOT_DIR) / "template" / "sm_jobs.py"
+        script_dst = Path(job_folder) / "launch.py"
+        shutil.copy(script_src, script_dst)
+        # FIXME: Remove transformers requirement when container is updated to include the version
+        # required to run multi-modal.
+        if self.cfg.recipes.get("model", None) and self.cfg.recipes.model.get("multi_modal", False):
+            reqs_filename = Path(job_folder) / "requirements.txt"
+            with open(reqs_filename, "w") as reqs_file:
+                reqs_file.write(f"transformers=={TRANSFORMERS_VERSION_FOR_MULTIMODAL}")
+
+    def make_sm_jobs_command(self):
+        """
+        Make submit command for sm_jobs cluster type. 
+ """ + instance_type = self.cfg.get("instance_type") + if instance_type is None: + raise ValueError("Expected instance_type to be set with sm_jobs cluster type") + sm_jobs_config = self.cfg.cluster.get("sm_jobs_config") + if sm_jobs_config is None: + raise ValueError("Expected sm_jobs_config to be set with sm_jobs cluster type") + if sm_jobs_config.get("output_path") is None: + raise ValueError("Expected output_path to be set with sm_jobs cluster type") + command = f"python launch.py --job_name {self.job_name} --instance_type {instance_type}" + command_groups = [["pushd $(dirname -- $0)", command, "popd"]] + return command_groups + + def run(self) -> str: + """ + Run current stage + """ + # Setup folders and datasets + self.setup_folder_and_data() + # Save stage hydra config + job_path = self.get_job_path() + # Identify if launching a trainium job + is_trainium = self.__class__.__name__ == "SMTrainingTrainiumRecipe" + + is_custom = self.cfg.get("training_cfg") is not None + if not is_custom: + stage_cfg_path = SMTraining.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg) + else: + stage_cfg_path = job_path.config_file + + if self.cluster == "sm_jobs": + if is_custom: + raise RuntimeError("SM jobs launcher is not supported with custom training.") + cluster_parameters = {"job_name": self.job_name} + self.create_sm_jobs_script(job_path.folder) + command_groups = self.make_sm_jobs_command() + else: + # Make cluster parameters + cluster_parameters = self._make_cluster_parameters(self.cluster) + + cluster_parameters["train_script_text"] = self._make_train_script_text(stage_cfg_path) + if get_container_type(self.cfg.container) == "docker": + cluster_parameters["launch_docker_container_text"] = self._make_launch_docker_container_text() + cluster_parameters["docker_exec_script_text"] = self._make_docker_exec_script_text(stage_cfg_path) + if get_container_type(self.cfg.container) != "enroot": + cluster_parameters.pop("container_mounts", None) + # if self.cfg.get("slurm_create_submission_file_only", None) is not None: + # cluster_parameters["slurm_create_submission_file_only"] = self.cfg.slurm_create_submission_file_only + cluster_parameters["hostfile"] = self._get_hostfile_location() + + if is_trainium and self.get_cluster_type() == "bcm": + # Save temp training config file with string interpolations resolved so it can be + # copied into Neuron's package by the compute node(s) eventually selected by Slurm. + # NOTE: This file can't be removed. Multiple nodes may run the job asynchronously + # so there aren't any order guarantees nor an ideal moment to remove the file. + OmegaConf.save(self.cfg.training, self._temp_training_conf_file, True) + + # Make k8s config file if necessary + if self.cluster == "k8s": + # The following two methods are overrides from the Training class. They require + # `template_root` but in our implementation we re-define it inside those methods. + # Therefore, `template_root` is just a sentinel so parent behavior is not broken. 
+ sentinel_template_root = "" + self._make_k8s_spec_file(sentinel_template_root, cluster_parameters, job_path, stage_cfg_path) + self._copy_k8s_helm_chart(sentinel_template_root, job_path) + + # k8s does not need command groups + command_groups = None + else: + command_groups = self.make_stage_command_groups(stage_cfg_path) + + launcher = SMAutoLauncher( + folder=job_path.folder, + cluster=self.cluster, + **cluster_parameters, + ) + job_id = launcher.launch(command_groups=command_groups) + + if self.cluster == "bcm": + try: + self.telemetry.start( + self.cluster, + self.instance_type, + get_num_nodes(self.stage_cfg), + job_id=job_id, + container=self.cfg.get("container", None), + ) + except: + pass + + return job_id + + def get_cluster_type(self) -> str: + """ + Get cluster type depending on whether configuration is custom or recipe + """ + # custom configurations have the `training_cfg` key + is_custom = self.cfg.get("training_cfg") is not None + + cluster_type = None + + if is_custom: + cluster_type = OmegaConf.select(self.cfg, "cluster.cluster_type") + else: + cluster_type = self.cfg.get("cluster_type") + + if cluster_type is None: + raise AttributeError("`cluster_type` is not defined in the configuration file") + + return cluster_type + + def get_script_args_str(self, stage_cfg_path: Path) -> str: + """ + Based on https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L608 + """ + if self.cluster == "k8s": + return "--config-path=/config --config-name=config.yaml" + return f"--config-path={stage_cfg_path.parents[0]} --config-name={stage_cfg_path.name}" + + def insert_git_token(self, repo_url_or_path: str, token: str) -> str: + """ + Insert git token to git repo url. Currently only support github repo + """ + if "github.com" in repo_url_or_path: + splitted_url = repo_url_or_path.split("github.com", 1) + repo_url_or_path = splitted_url[0] + self.cfg.git.token + "@github.com" + splitted_url[1] + return repo_url_or_path + + def _make_nemo_path_command(self) -> List[str]: + """Extend nemo path to python path""" + # [TODO] clone the nemo/SFA/NxTT repo and handle point to the right path + return super()._make_nemo_path_command() + + def _make_container_mounts_string(self) -> str: + """ + Make container mounting string based on hydra configurations + + :return: container mounting string, e.g. "/path/to/A:/path/to/A,/path/to/B:/path/to/B,..." + :rtype: str + """ + + def add_container_mounts(container_mounts): + mounts_str = "" + if container_mounts is not None: + assert isinstance(container_mounts, omegaconf.listconfig.ListConfig), "container_mounts must be a list." 
+ for mount in container_mounts: + if mount is not None and isinstance(mount, str): + mounts_str += f",{mount}" if ":" in mount else f",{mount}:{mount}" + return mounts_str + + cfg = self.cfg + base_results_dir = cfg.get("base_results_dir") + mounts_string = ( + f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{base_results_dir}:{base_results_dir}" + ) + + # mount volume only if inside a Hyperpod environment + hp_logs_dir = "/var/log/aws/clusters" + if Path(hp_logs_dir).is_dir(): + mounts_string += f",{hp_logs_dir}:{hp_logs_dir}" + + """Start of SM change""" + container_mounts = cfg.cluster.get("container_mounts") + """End of SM change""" + + mounts_string += add_container_mounts(container_mounts) + + # https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L264 + # We do not have data dir for custom launching + mounts_string = mounts_string.replace(",None:None", "") + if self._use_local_repo(): + mounts_string += f",{self.cfg.git.repo_url_or_path}:{self.cfg.git.repo_url_or_path}" + return mounts_string + + def generate_default_k8s_value_template(self, template_root, cluster_parameters, stage_cfg_path=None): + """ + Setting the general k8s configs that will be applicable for all device types and training methods + """ + with open(template_root / "values.yaml") as value_file: + values_template = OmegaConf.load(value_file) + + values_template.image.trainingImage = cluster_parameters["container_image"] + values_template.trainingConfig.jobName = self.stage_cfg.run.name + + # Cluster configs + values_template.trainingConfig.numEFADevices = self.num_efa_devices + if "pullPolicy" in cluster_parameters: + values_template.image.pullPolicy = cluster_parameters["pullPolicy"] + if "env_vars" in cluster_parameters: + values_template.trainingConfig.envVars = cluster_parameters["env_vars"] + if "restartPolicy" in cluster_parameters: + values_template.trainingConfig.restartPolicy = cluster_parameters["restartPolicy"] + if "persistent_volume_claims" in cluster_parameters: + values_template.trainingConfig.persistentVolumeClaims = cluster_parameters["persistent_volume_claims"] + if "volumes" in cluster_parameters: + values_template.trainingConfig.volumes = cluster_parameters["volumes"] + if cluster_parameters.get("namespace", None) is not None: + values_template.trainingConfig.namespace = cluster_parameters["namespace"] + if cluster_parameters.get("annotations", None) is not None: + values_template.trainingConfig.annotations = cluster_parameters["annotations"] + if cluster_parameters.get("priority_class_name", None) is not None: + values_template.trainingConfig.priorityClassName = cluster_parameters["priority_class_name"] + if cluster_parameters.get("service_account_name") is not None: + values_template.trainingConfig.serviceAccountName = cluster_parameters["service_account_name"] + if cluster_parameters.get("custom_labels", None) is not None: + values_template.trainingConfig.customLabels = cluster_parameters["custom_labels"] + if cluster_parameters.get("label_selector", None) is not None: + values_template.trainingConfig.labelSelector = cluster_parameters["label_selector"] + values_template.trainingConfig.compile = OmegaConf.select(self.cfg, "recipes.run.compile", default=0) + if self._default_repo is not None: + values_template.trainingConfig.git.repo_url_or_path = self._default_repo + if self._default_branch is not None: + values_template.trainingConfig.git.branch = self._default_branch + + # Git configs + if self.cfg.get("git", None) is not 
None: + if self.cfg.git.get("repo_url_or_path", None) is not None: + repo_url_or_path = str(self.cfg.git.repo_url_or_path) + # We only support to use local repo path for slurm, bcm is nemo launcher version of slurm cluster + if not (repo_url_or_path.startswith("http") or repo_url_or_path.startswith("codecommit::")): + raise ValueError("local git repo path is only supported for slurm based cluster") + if self.cfg.git.get("token", None) is not None: + repo_url_or_path = self.insert_git_token(repo_url_or_path, self.cfg.git.token) + + values_template.trainingConfig.git.repo_url_or_path = repo_url_or_path + if self.cfg.git.get("branch", None) is not None: + values_template.trainingConfig.git.branch = self.cfg.git.branch + if self.cfg.git.get("commit", None) is not None: + values_template.trainingConfig.git.commit = self.cfg.git.commit + + values_template.trainingConfig.device = self.device + values_template.trainingConfig.scriptArgs = self.get_script_args_str(stage_cfg_path) + return values_template + + def write_value_template(self, values_template, job_path): + """ + Write the value template into disk + """ + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / "k8s_template" / "values.yaml") + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = OmegaConf.create(values_template) + OmegaConf.save(conf, k8s_template_file) + + def update_stage_specific_k8s_values(self, values_template): + """ + Update the k8s configs that is related to the current stage + """ + values_template.trainingConfig.ntasksPerNode = self.stage_cfg.trainer.devices + values_template.trainingConfig.nodes = self.stage_cfg.trainer.num_nodes + choice_model_type, _ = self.get_stage_config_choice() + if self.cfg.git.get("entry_script", None) is not None: + # Override with entry script provided by the customer + values_template.trainingConfig.scriptPath = self.cfg.git.entry_script + else: + values_template.trainingConfig.scriptPath = str(self._entry_script_path) + + if self.cfg.recipes.get("model", None) and self.cfg.recipes.model.get("multi_modal", False): + transformers_upgrade_cmd = "pip install transformers==4.45.2" + values_template.trainingConfig.pre_script.append(transformers_upgrade_cmd) + + return values_template + + # @override - available in Python 3.12 - `template_root` is required by parent implementation + def _make_k8s_spec_file( + self, template_root: str, cluster_parameters: Dict, job_path: JobPaths, stage_cfg_path=None + ): + """ + Referring from https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L669 + Break the function into 3 parts so we can easily override in different stages + - Create general k8s configs that will be applicable for all device types and training methods + - Update stage specific k8s configs + - Write k8s configs to disk as value.yaml, which will be consumed by helm + """ + # Need to override the template_root to use our templates + # [TODO] Currently hard-code it to do the stage as training + template_root: Path = ROOT_DIR / "launcher/nemo/k8s_templates/training" + values_template = self.generate_default_k8s_value_template(template_root, cluster_parameters, stage_cfg_path) + values_template = self.update_stage_specific_k8s_values(values_template) + self.write_value_template(values_template, job_path) + + def _copy_k8s_helm_helper_configs(self, src_training_dir: Path, job_path: JobPaths): + """ + Copy helper Helm files into results directory + """ + # copy the Trainium and GPU config 
files + gpu_config = "train-script-gpu.yaml" + trn_config = "train-script-trn.yaml" + templates_path = Path(job_path.folder / "k8s_template" / "templates") + shutil.copy2(str(src_training_dir / gpu_config), str(templates_path / gpu_config)) + shutil.copy2(str(src_training_dir / trn_config), str(templates_path / trn_config)) + + # @override - available in Python 3.12 - `template_root` is required by parent implementation + def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths): + # Need to override the template_root to use our templates + # [TODO] Currently hard-code it to do the stage as training + src_training_dir = ROOT_DIR / "launcher/nemo/k8s_templates/training" + super()._copy_k8s_helm_chart(str(src_training_dir), job_path) + self._copy_k8s_helm_helper_configs(src_training_dir, job_path) + + def get_env_vars(self) -> Dict: + """ + Set up dictionary for environment variables + By default injecting the EFA env variable when doing multi-node training + The environment variables from hydra config will be set inside the job scripts. + For Example: + Set `env_vars.NVTE_BIAS_DROPOUT_FUSION=1` while calling nemo_launcherlauncher-scripts, + `NVTE_BIAS_DROPOUT_FUSION=1` will be set while running the job. + + :return: a dictionary of env vars while running the job. + :rtype: Dict + """ + env_vars = super().get_env_vars() + stage_cfg = self.stage_cfg + nodes = get_num_nodes(stage_cfg) + if int(nodes) > 1: + env_vars = set_multinode_envs(env_vars, self.instance_type) + return env_vars + + +class SMCustomTraining(SMTraining): + """ + Base stage for the custom training on Sagemaker. + """ + + @property + def _entry_script_path(self) -> Path: + return Path(self.stage_cfg.entry_script) + + def setup_stage_vars(self, cfg): + """Setup the stage vars, i.e. 
stage name and stage cfg"""
+        self.stage_name = "custom_training_sm"
+        self.stage_cfg = cfg.get("training_cfg")
+
+    def get_script_args_str(self, stage_cfg_path=None):
+        """
+        Get all script args and join them into a single string
+        """
+        arg_str = []
+        if self.stage_cfg.get("script_args", None) is not None:
+            # script_args is a list of dicts mapping arg_name to arg_value
+            for arg in list(self.stage_cfg.script_args):
+                for key, val in arg.items():
+                    arg_str.append(f"{key} {val} ")
+        return "".join(arg_str)
+
+    def update_stage_specific_k8s_values(self, values_template):
+        """
+        Custom-training-specific k8s values
+        """
+        values_template.trainingConfig.ntasksPerNode = get_ntasks_per_node(self.stage_cfg)
+        values_template.trainingConfig.nodes = get_num_nodes(self.stage_cfg)
+        values_template.trainingConfig.scriptPath = self.stage_cfg.entry_script
+        values_template.trainingConfig.customScript = True
+        return values_template
+
+    def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths):
+        # Need to override the template_root to use our templates
+        # [TODO] Currently hard-code it to do the stage as training
+        src_training_dir = ROOT_DIR / "launcher/nemo/k8s_templates/training"
+
+        # For a custom run, there is no need for training config files
+        # Only creating training.yaml, Chart.yaml
+        template_file = str(src_training_dir / "training.yaml")
+        chart_file = str(src_training_dir / "Chart.yaml")
+        training_path = Path(job_path.folder / "k8s_template" / "templates" / "training.yaml")
+        training_path.parent.mkdir(parents=True, exist_ok=True)
+        chart_path = Path(job_path.folder / "k8s_template" / "Chart.yaml")
+
+        shutil.copy2(template_file, training_path)
+        shutil.copy2(chart_file, chart_path)
+        self._copy_k8s_helm_helper_configs(src_training_dir, job_path)
+
+    def _make_cluster_parameters(self, cluster: str) -> Dict:
+        """
+        Make cluster-specific parameters for jobs on different clusters.
+
+        :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc.
+        :return: a dictionary of cluster parameters, e.g. 
`ntasks_per_node` + :rtype: Dict + """ + with omegaconf.open_dict(self.cfg): + # Patch self.cfg.cluster to align with + # https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L312 + origin_cluster = self.cfg.cluster + self.cfg.cluster = self.cfg.cluster.cluster_config + cluster_parameters = super()._make_cluster_parameters(cluster) + cluster_type = origin_cluster.get("cluster_type") + if cluster_type == "k8s": + env_vars = cluster_parameters.get("env_vars") + if env_vars and "SLURM_NTASKS_PER_NODE" in env_vars: + env_vars.pop("SLURM_NTASKS_PER_NODE") + self.cfg.cluster = origin_cluster + return cluster_parameters + + +class SMCustomTrainingGPU(SMCustomTraining): + """ + Stage for training with custom stage on GPU + """ + + @property + def _cuda_visible_devices(self) -> str: + ntasks_per_node = get_ntasks_per_node(self.stage_cfg) + if ntasks_per_node is None: + ntasks_per_node = 8 + return ( + "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" + if ntasks_per_node == 8 + else f"CUDA_VISIBLE_DEVICES={','.join(map(str, range(ntasks_per_node)))}" + ) + + @property + def _set_ln_sm_margin(self) -> str: + return "" + + @property + def _skip_ag_overlap(self) -> str: + return "" + + +class SMCustomTrainingCPU(SMCustomTrainingGPU): + """ + Stage for custom training on CPU + """ + + def __init__(self, cfg): + super().__init__(cfg) + self.device = "cpu" + + @property + def _cuda_visible_devices(self) -> str: + return "" + + +class SMCustomTrainingTrainium(SMCustomTraining): + """ + Stage for custom training on Trainium + """ + + def __init__(self, cfg): + super().__init__(cfg) + self.device = "trainium" + + def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: + """ + Make the command groups for current stage + Command groups is a list of command group. A command group is defined as: + 0. Command group is a list of command strings + 1. Each command group occupies one bcprun, srun or bash + 2. 
Each command group eventually has multiple commands connected by ";" + + :param Path stage_cfg_path: path to interpolated and saved configuration + :return: command groups for current stage + :rtype: List[List[str]] + """ + + def update_stage_specific_k8s_values(self, values_template): + """ + Custom training specifc k8s values for trainum + """ + super().update_stage_specific_k8s_values(values_template) + values_template.trainingConfig.numNeuronDevices = get_num_accelerator_devices(self.instance_type) + return values_template diff --git a/launcher/telemetry.py b/launcher/telemetry.py new file mode 100644 index 0000000..ebbeb80 --- /dev/null +++ b/launcher/telemetry.py @@ -0,0 +1,95 @@ +import json +import os +import sys +import time +from dataclasses import asdict, dataclass, field +from typing import List + +CW_NAME_SPACE = "RecipesTelemetry" + + +@dataclass +class Metric: + Name: str = None + Unit: str = None + + +@dataclass +class MetricDirective: + Namespace: str = "" + Dimensions: List[List[str]] = None + Metrics: List[Metric] = None + + +@dataclass +class Metadata: + CloudWatchMetrics: List[MetricDirective] = field(default_factory=lambda: [MetricDirective]) + Timestamp: int = None + + +@dataclass +class CWTelemetryStart: + account_id: str = "" + training_start_time: int = 0 + num_nodes: int = 0 + job_name: str = "" + cluster_type: str = "" + instance_type: str = "" + _aws: Metadata = None + job_id: int = 0 + recipe: str = "" + container: str = "" + + +class Telemetry: + def __init__(self, log_path="/var/log/aws/clusters/sagemaker-hyperpod-recipes-telemetry.log"): + self.log_path = log_path + + def get_account_id(self): + import boto3 + + client = boto3.client("sts") + return client.get_caller_identity()["Account"] + + def publish_cw_log(self, log): + save_log = asdict(log) + with open(self.log_path, "a") as f: + f.write(json.dumps(save_log, separators=(",", ":")) + "\n") + + def start( + self, + cluster_type=None, + instance_type=None, + num_nodes=None, + job_id=None, + container=None, + ): + if not os.path.exists(self.log_path): + return + account_id = self.get_account_id() + cw_telemetry_start = CWTelemetryStart(account_id=account_id) + cw_telemetry_start.training_start_time = int(time.time() * 1000) + cw_telemetry_start.num_nodes = int(num_nodes) + cw_telemetry_start.cluster_type = cluster_type + cw_telemetry_start.instance_type = instance_type + cw_telemetry_start.job_id = job_id + cw_telemetry_start.container = container + + recipe = "" + for arg in sys.argv: + if arg.startswith("recipes="): + recipe = arg.split("=")[1] + cw_telemetry_start.recipe = recipe + + metadata = Metadata( + Timestamp=int(time.time() * 1000), + CloudWatchMetrics=[ + MetricDirective( + Namespace=CW_NAME_SPACE, + Dimensions=[[]], + Metrics=[Metric(Name="num_nodes", Unit="Count")], + ) + ], + ) + cw_telemetry_start._aws = metadata + self.publish_cw_log(cw_telemetry_start) diff --git a/launcher_scripts/custom_model/run_falcon.sh b/launcher_scripts/custom_model/run_falcon.sh new file mode 100755 index 0000000..a8492d0 --- /dev/null +++ b/launcher_scripts/custom_model/run_falcon.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com
+
+#Users should setup their cluster type in /recipes_collection/config.yaml
+
+SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}
+
+TRAIN_DIR=${TRAIN_DIR} # Location of training dataset
+VAL_DIR=${VAL_DIR} # Location of validation dataset
+
+EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc.
+
+
+HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \
+recipes=training/custom_model/falcon \
+base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \
+recipes.run.name="hf-falcon" \
+recipes.exp_manager.exp_dir=$EXP_DIR \
+recipes.trainer.num_nodes=4 \
+recipes.model.train_batch_size=2 \
+recipes.model.data.train_dir=$TRAIN_DIR \
+recipes.model.data.val_dir=$VAL_DIR \
diff --git a/launcher_scripts/custom_script/README.md b/launcher_scripts/custom_script/README.md
new file mode 100644
index 0000000..8a485d3
--- /dev/null
+++ b/launcher_scripts/custom_script/README.md
@@ -0,0 +1,26 @@
+# Config for running with custom scripts
+The custom config allows users to use the launcher to run custom jobs that do not use our recipes. We use the Hydra format for the configs, the same as our recipes. Please refer to `config_slurm.yaml` and `config_k8s.yaml` as templates; they align with the `config.yaml` in the recipes folder, with some extra configs for the cluster and the custom script.
+## Config fields
+Here are some essential fields that users might want to override for custom training:
+- training_cfg: This field contains most configs about the training run
+  - entry_script: Path to the entry script for training/fine-tuning. This path can be one in the container mounts.
+  - script_args: The args that will be used to run this script
+  - run: All runtime configs
+    - name: Current run name
+    - nodes: Number of nodes to use
+    - ntasks_per_node: Number of devices to use per node
+    - results_dir: Directory to store the results. It is recommended to keep it as `${base_results_dir}/${.name}` so everything will be in `base_results_dir`
+- cluster: All cluster-based configs
+  - cluster_type: Type of the cluster; can be slurm (bcm) or k8s
+  - instance_type: Instance type to use; if null, the default instance type of the cluster is used.
+  - cluster_config: The detailed cluster config, which differs between slurm and k8s. For details please refer to the recipe documentation about cluster setup.
+    - namespace: Namespace to launch jobs in
+    - custom_labels: k8s labels applied to the job and to each pod running the job; see more details about labels in https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+    - annotations: k8s annotations added to the job; see more details in https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
+    - priority_class_name: Kueue scheduler priority class name; see more details in https://kueue.sigs.k8s.io/
+    - label_selector: k8s NodeAffinity functionality, to allow node selection based on required labels or priority scheduling based on preferred labels.
+    - service_account_name: AWS EKS service account name, used to give pods credentials to call AWS services.
+    - persistent_volume_claims: Specify one or more persistent volume claims to mount into the job pod.
+The rest of the configs are similar to the recipe configs.
+## Launch
+To launch the job, run `python main.py --config-path launcher_scripts/custom_script --config-name config_slurm` (or `config_k8s` for Kubernetes) from the launcher root directory, or point `--config-path` at your own config folder, as in the example below.
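+
+For example, a minimal Slurm launch might look like the following sketch (the container URI is a placeholder you must fill in; `run_allreduce.sh` in this folder is a complete, runnable version of the same command):
+```bash
+# Launch the sample all-reduce job defined by config_slurm.yaml, overriding the
+# entry script, results directory, container mounts and container on the command line.
+HYDRA_FULL_ERROR=1 python3 main.py \
+    --config-path="$(pwd)/launcher_scripts/custom_script" \
+    --config-name=config_slurm \
+    base_results_dir="$(pwd)/results" \
+    training_cfg.entry_script="$(pwd)/launcher_scripts/custom_script/custom_allreduce.py" \
+    container_mounts=[$(pwd)] \
+    container=<your-training-container-uri>
+```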
diff --git a/launcher_scripts/custom_script/config_k8s.yaml b/launcher_scripts/custom_script/config_k8s.yaml new file mode 100755 index 0000000..e317f71 --- /dev/null +++ b/launcher_scripts/custom_script/config_k8s.yaml @@ -0,0 +1,98 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +git: + repo_url_or_path: null + branch: null + commit: null + token: null + +training_cfg: + + entry_script: ??? # Path to the entry script of training/fine-tuning. This path should be inside container or relative path in git repo + script_args: + - "--some_args" : "debug" + - "--some_other_args" : 1 + run: + name: test_custom # Current run name + nodes: 8 # Number of nodes to use for current training + ntasks_per_node: 8 # Number of devices to use per node + +cluster: + # Example k8s cluster + cluster_type: k8s + instance_type: ??? + cluster_config: + namespace: default # the namespace to submit job + # create customized labels for the PytorchJob and Pods deployed jobs. + # Example: + # custom_labels: + # label-key-1: label-value-1 + # label-key-2: label-value-2 + custom_labels: null + # create customized annotations for the jobs. + # Example: + # annotations: + # annotation-key-1: annotation-value-1 + # annotation-key-2: annotation-value-2 + annotations: null + # add service account to job pods + # Example: + # serviceAccountName: service_account + service_account_name: null + # priorityClassName for Kueue scheduler to decide jobs priority + priority_class_name: null + # Create k8s NodeAffinity to select nodes to deploy jobs which matches required and preferred labels + # Structure: + # label_selector: + # required: + # preferred: + # weights: + # Example: + # label_selector: + # required: + # example-label-key: + # - expected-label-value-1 + # - expected-label-value-2 + # preferred: + # preferred-label-key: + # - preferred-label-value-1 + # - preferred-label-value-2 + # weights: + # - 100 + label_selector: null + # persistent volume, usually used to mount FSx + # Example: + # persistent_volume_claims: + # - claimName: null + # mountPath: null + # - claimName: null + # mountPath: null + persistent_volume_claims: null + +# temp volume: usually used to mount temp directory +# Example: +# volumes: +# - volumeName: data1 +# hostPath: "/data" +# mountPath: "/data" + + volumes: null + + pullPolicy: Always # policy to pull container, can be Always, IfNotPresent and Never + restartPolicy: Never # restart policy + +base_results_dir: ??? # Location to store the results, checkpoints and logs. +container_mounts: # List of additional paths to mount to container. They will be mounted to same path. + - null +container: ??? # container to use + +env_vars: + NCCL_DEBUG: DEBUG # Logging level for NCCL. Set to "INFO" for debug information diff --git a/launcher_scripts/custom_script/config_slurm.yaml b/launcher_scripts/custom_script/config_slurm.yaml new file mode 100755 index 0000000..1c267c3 --- /dev/null +++ b/launcher_scripts/custom_script/config_slurm.yaml @@ -0,0 +1,50 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +git: + repo_url_or_path: null + branch: null + commit: null + token: null + +training_cfg: + + entry_script: null # Path to the entry script of training/fine-tuning. 
This path should be inside the container or a relative path in the git repo
+  # script_args:
+  #   - "--some_args" : "debug"
+  #   - "--some_other_args" : 1
+  run:
+    name: test_custom # Current run name
+    nodes: 2 # Number of nodes to use for current training
+    ntasks_per_node: 8 # Number of devices to use per node
+
+cluster:
+  # Example slurm cluster
+
+  cluster_type: slurm
+  instance_type: p5.48xlarge
+  cluster_config:
+    exclusive: True
+    job_name_prefix: testcustom_slurm_
+    slurm_create_submission_file_only: False # Set to True to only create the submission file without submitting the job
+    srun_args:
+    # - "--no-container-mount-home"
+
+base_results_dir: null # Location to store the results, checkpoints and logs.
+container_mounts: # List of additional paths to mount to the container. They will be mounted at the same path.
+  - null
+container: null # container to use
+slurm_docker_cfg: # Only used when running docker on slurm
+  docker_args:
+    # - "--runtime=nvidia" # this is required if the docker runtime version is low
+  post_launch_commands: # commands to run (via bash) after launching the docker container
+
+env_vars:
+  NCCL_DEBUG: DEBUG # Logging level for NCCL. Set to "INFO" for debug information
diff --git a/launcher_scripts/custom_script/custom_allreduce.py b/launcher_scripts/custom_script/custom_allreduce.py
new file mode 100644
index 0000000..7be4ebe
--- /dev/null
+++ b/launcher_scripts/custom_script/custom_allreduce.py
@@ -0,0 +1,11 @@
+import torch
+import torch.distributed as dist
+
+print("init process group")
+dist.init_process_group("nccl")
+print("rank:", dist.get_rank())
+torch.cuda.set_device(dist.get_rank() % 8)
+tensor = torch.randn(4, 4, device="cuda")
+print(f"[{dist.get_rank()}] tensor {tensor}")
+dist.all_reduce(tensor)
+print(f"[{dist.get_rank()}] tensor {tensor} after reduce")
diff --git a/launcher_scripts/custom_script/run_allreduce.sh b/launcher_scripts/custom_script/run_allreduce.sh
new file mode 100755
index 0000000..89d4c44
--- /dev/null
+++ b/launcher_scripts/custom_script/run_allreduce.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /recipes_collection/config.yaml
+
+SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"}
+
+TRAIN_DIR=${TRAIN_DIR} # Location of training dataset
+VAL_DIR=${VAL_DIR} # Location of validation dataset
+
+EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc.
+
+
+HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \
+--config-path=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/launcher_scripts/custom_script \
+--config-name=config_slurm \
+base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \
+training_cfg.entry_script=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/launcher_scripts/custom_script/custom_allreduce.py \
+container_mounts=[${SAGEMAKER_TRAINING_LAUNCHER_DIR}] \
+container=\
diff --git a/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k.sh b/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k.sh
new file mode 100755
index 0000000..b48d0e4
--- /dev/null
+++ b/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=training/llama/p4_hf_llama3_70b_seq8k_gpu \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-70b" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=32 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ diff --git a/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh b/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh new file mode 100755 index 0000000..522b4f6 --- /dev/null +++ b/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_fine_tuning \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-70b-fine-tuning" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=32 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_lora.sh b/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_lora.sh new file mode 100755 index 0000000..7fb1dae --- /dev/null +++ b/launcher_scripts/llama/p4_run_hf_llama3_70b_seq8k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
+ + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_lora \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-70b-lora" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=20 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh b/launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh new file mode 100755 index 0000000..932049b --- /dev/null +++ b/launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, ect + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_fine_tuning \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-8b-fine-tuning" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=4 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_lora.sh b/launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_lora.sh new file mode 100755 index 0000000..8b2d464 --- /dev/null +++ b/launcher_scripts/llama/p4_run_hf_llama3_8b_seq8k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
+ + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_lora \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-8b-lora" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=1 \ recipes.model.train_batch_size=1 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh new file mode 100755 index 0000000..96f2f93 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR=${TRAIN_DIR} # Location of training dataset +VAL_DIR=${VAL_DIR} # Location of validation dataset + +EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-2-11b" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ diff --git a/launcher_scripts/llama/run_hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.sh new file mode 100755 index 0000000..40abb35 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ recipes=training/llama/hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain \ base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ recipes.run.name="hf_llama3_2_1b" \ recipes.exp_manager.exp_dir=$EXP_DIR \ recipes.model.data.train_dir=$TRAIN_DIR \ recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh new file mode 100755 index 0000000..10e6010 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ recipes=training/llama/hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain \ base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ recipes.run.name="hf_llama3_2_3b" \ recipes.exp_manager.exp_dir=$EXP_DIR \ recipes.model.data.train_dir=$TRAIN_DIR \ recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh new file mode 100644 index 0000000..ea6d527 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR=${TRAIN_DIR} # Location of training dataset +VAL_DIR=${VAL_DIR} # Location of validation dataset + +EXP_DIR=${EXP_DIR} # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-2-90b" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ diff --git a/launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh b/launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh new file mode 100755 index 0000000..684c023 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_405b_seq128k_gpu_qlora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
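Every recipes.* flag in these launchers is a plain Hydra override, so extra settings can be appended in the same style without editing the recipe file; the sketch below shortens the run and changes the node count, and assumes the recipe exposes trainer.max_steps (the values are illustrative, not recommendations).

HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \
    recipes=fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora \
    base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR" \
    recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \
    recipes.trainer.num_nodes=4 \
    recipes.trainer.max_steps=10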
+ + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-405b-seq131072-qlora" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=2 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh b/launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh new file mode 100755 index 0000000..5ca851f --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-405b-lora" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=6 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_qlora.sh b/launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_qlora.sh new file mode 100755 index 0000000..8314330 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_405b_seq16k_gpu_qlora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
+ + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-405b-qlora" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=2 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_lora.sh b/launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_lora.sh new file mode 100755 index 0000000..8171a61 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-405b-lora" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=6 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_qlora.sh b/launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_qlora.sh new file mode 100755 index 0000000..3fc7d0e --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_405b_seq8k_gpu_qlora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
+ + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-405b-qlora" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=2 \ recipes.model.train_batch_size=1 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh new file mode 100755 index 0000000..3b66166 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_fine_tuning.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-70b-fine-tuning" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=16 \ recipes.model.train_batch_size=1 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh new file mode 100755 index 0000000..5c39e7a --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
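Because the optional HF_ACCESS_TOKEN is forwarded on the launch command line below, one common precaution is to keep it out of shell history and read it from a file instead; a small illustrative sketch (the token file and paths are placeholders):

export HF_ACCESS_TOKEN="$(cat "$HOME/.hf_token")"            # placeholder file holding the HuggingFace token
export HF_MODEL_NAME_OR_PATH="meta-llama/Meta-Llama-3-70B"   # placeholder: hub ID or local path
export TRAIN_DIR=/fsx/datasets/sft/train VAL_DIR=/fsx/datasets/sft/val EXP_DIR=/fsx/experiments/llama3-70b-lora
bash launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_lora.sh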
+ + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-70b-lora" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=2 \ + recipes.model.train_batch_size=1 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh new file mode 100755 index 0000000..df4d2b9 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-llama3-70b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh new file mode 100755 index 0000000..94c04a3 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq16k_gpu_p5x64_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-llama3-70b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh new file mode 100755 index 0000000..5f3bcd7 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_fine_tuning.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-70b-fine-tuning" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=10 \ recipes.model.train_batch_size=1 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_lora.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_lora.sh new file mode 100755 index 0000000..c8f2192 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/hf_llama3_70b_seq8k_gpu_lora \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-70b-lora" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=1 \ recipes.model.train_batch_size=1 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh new file mode 100644 index 0000000..bd9261c --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
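The pretraining launchers only need the three directories above; an illustrative one-liner with placeholder FSx paths:

TRAIN_DIR=/fsx/datasets/c4/train \
VAL_DIR=/fsx/datasets/c4/val \
EXP_DIR=/fsx/experiments/llama3-70b-pretrain \
bash launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x32_pretrain.sh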
+ + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-llama3-70b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh new file mode 100755 index 0000000..e8236df --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_gpu_p5x64_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-llama3-70b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh new file mode 100644 index 0000000..793ebfa --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_70b_seq8k_trn1x16_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +COMPILE="${COMPILE}" # Set to 1 to compile the model, 0 to load a pre-compiled model +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + instance_type="trn1.32xlarge" \ + recipes=training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain \ + recipes.run.name="hf-llama3-70b" \ + recipes.run.compile="$COMPILE" \ + recipes.trainer.max_steps=50 \ + recipes.data.train_dir="$TRAIN_DIR" \ + recipes.model.model_config="$MODEL_CONFIG" \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh new file mode 100755 index 0000000..f9dc348 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_fine_tuning.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-8b-fine-tuning" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=1 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh new file mode 100755 index 0000000..78e5289 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/hf_llama3_8b_seq16k_gpu_lora \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-8b-lora" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=1 \ recipes.model.train_batch_size=1 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh new file mode 100644 index 0000000..8e6c519 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x16_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc.
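The header comment points at /recipes_collection/config.yaml for cluster selection; as a sketch, the cluster group can also be switched per run on the command line, assuming the k8s cluster config that this change also adds (all paths are placeholders):

export TRAIN_DIR=/fsx/datasets/pile/train    # placeholder
export VAL_DIR=/fsx/datasets/pile/val        # placeholder
export EXP_DIR=/fsx/experiments/llama3-8b    # placeholder
HYDRA_FULL_ERROR=1 python3 main.py \
    recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain \
    cluster=k8s cluster_type=k8s \
    base_results_dir="$(pwd)/results" \
    recipes.exp_manager.exp_dir="$EXP_DIR" \
    recipes.model.data.train_dir="$TRAIN_DIR" \
    recipes.model.data.val_dir="$VAL_DIR"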
+ + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain \ base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ recipes.run.name="hf-llama3-8b" \ recipes.exp_manager.exp_dir=$EXP_DIR \ recipes.model.data.train_dir=$TRAIN_DIR \ recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh new file mode 100644 index 0000000..b746c40 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq16k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ recipes=training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain \ base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ recipes.run.name="hf-llama3-8b" \ recipes.exp_manager.exp_dir=$EXP_DIR \ recipes.model.data.train_dir=$TRAIN_DIR \ recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh new file mode 100755 index 0000000..9e31b49 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_fine_tuning.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_fine_tuning \ base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ recipes.run.name="hf-llama3-8b-fine-tuning" \ recipes.exp_manager.exp_dir="$EXP_DIR" \ recipes.trainer.num_nodes=1 \ recipes.model.train_batch_size=2 \ recipes.model.data.train_dir="$TRAIN_DIR" \ recipes.model.data.val_dir="$VAL_DIR" \ recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_lora.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_lora.sh new file mode 100755 index 0000000..8fb06c5 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_lora.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION.
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +HF_MODEL_NAME_OR_PATH="${HF_MODEL_NAME_OR_PATH}" # HuggingFace pretrained model name or path +HF_ACCESS_TOKEN="${HF_ACCESS_TOKEN}" # Optional HuggingFace access token + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + recipes=fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + recipes.run.name="hf-llama3-8b-lora" \ + recipes.exp_manager.exp_dir="$EXP_DIR" \ + recipes.trainer.num_nodes=1 \ + recipes.model.train_batch_size=2 \ + recipes.model.data.train_dir="$TRAIN_DIR" \ + recipes.model.data.val_dir="$VAL_DIR" \ + recipes.model.hf_model_name_or_path="$HF_MODEL_NAME_OR_PATH" \ + recipes.model.hf_access_token="$HF_ACCESS_TOKEN" \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh new file mode 100644 index 0000000..59c0d13 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x16_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-llama3-8b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh new file mode 100644 index 0000000..2be7900 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. 
+ + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/llama/hf_llama3_8b_seq8k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-llama3-8b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh new file mode 100644 index 0000000..e5f48e5 --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1_fine_tuning.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +COMPILE="${COMPILE}" +COMPILER_CACHE_PATH="${COMPILER_CACHE_PATH}" +TOKENIZER_TYPE="${TOKENIZER_TYPE}" +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset +RESUME_FROM_CHECKPOINT_DIR="${RESUME_FROM_CHECKPOINT_DIR}" +MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + instance_type="trn1.32xlarge" \ + recipes=training/llama/hf_llama3_8b_seq8k_trn1_fine_tuning \ + recipes.run.name="hf-llama3-8b-sft" \ + recipes.run.compile="$COMPILE" \ + recipes.trainer.max_steps=50 \ + recipes.compiler_cache_url="$COMPILER_CACHE_PATH" \ + recipes.data.tokenizer.type="$TOKENIZER_TYPE" \ + recipes.data.train_dir="$TRAIN_DIR" \ + recipes.data.val_dir="$VAL_DIR" \ + recipes.exp_manager.resume_from_checkpoint="$RESUME_FROM_CHECKPOINT_DIR" \ + recipes.model.model_config="$MODEL_CONFIG" \ diff --git a/launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh new file mode 100644 index 0000000..9b41b7b --- /dev/null +++ b/launcher_scripts/llama/run_hf_llama3_8b_seq8k_trn1x4_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +COMPILE=0 +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +MODEL_CONFIG="${MODEL_CONFIG}" # Location of config.json for the model + +HYDRA_FULL_ERROR=1 python3 "${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py" \ + base_results_dir="${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results" \ + instance_type="trn1.32xlarge" \ + recipes=training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain \ + recipes.run.name="hf-llama3-8b" \ + recipes.run.compile="$COMPILE" \ + recipes.trainer.max_steps=50 \ + recipes.data.train_dir="$TRAIN_DIR" \ + recipes.model.model_config="$MODEL_CONFIG" \ diff --git a/launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh b/launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh new file mode 100755 index 0000000..1f5647f --- /dev/null +++ b/launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x16_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mistral-7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh b/launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh new file mode 100755 index 0000000..b6e5766 --- /dev/null +++ b/launcher_scripts/mistral/run_hf_mistral_7b_seq16k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mistral-7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh b/launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh new file mode 100755 index 0000000..e7b5cd5 --- /dev/null +++ b/launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x16_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mistral-7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh b/launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh new file mode 100755 index 0000000..31dbaa7 --- /dev/null +++ b/launcher_scripts/mistral/run_hf_mistral_7b_seq8k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mistral-7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh new file mode 100755 index 0000000..472abc0 --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x22b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh new file mode 100755 index 0000000..3e092b6 --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x22b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh new file mode 100755 index 0000000..88424e8 --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x22b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh new file mode 100755 index 0000000..22b8368 --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x22b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh new file mode 100755 index 0000000..d5791ec --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh new file mode 100755 index 0000000..161e79c --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh new file mode 100755 index 0000000..0e51777 --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh new file mode 100755 index 0000000..5cccab6 --- /dev/null +++ b/launcher_scripts/mixtral/run_hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +#Users should setup their cluster type in /recipes_collection/config.yaml + +SAGEMAKER_TRAINING_LAUNCHER_DIR=${SAGEMAKER_TRAINING_LAUNCHER_DIR:-"$(pwd)"} + +TRAIN_DIR="${TRAIN_DIR}" # Location of training dataset +VAL_DIR="${VAL_DIR}" # Location of validation dataset + +EXP_DIR="${EXP_DIR}" # Location to save experiment info including logging, checkpoints, etc. + + +HYDRA_FULL_ERROR=1 python3 ${SAGEMAKER_TRAINING_LAUNCHER_DIR}/main.py \ + recipes=training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain \ + base_results_dir=${SAGEMAKER_TRAINING_LAUNCHER_DIR}/results \ + recipes.run.name="hf-mixtral-8x7b" \ + recipes.exp_manager.exp_dir=$EXP_DIR \ + recipes.model.data.train_dir=$TRAIN_DIR \ + recipes.model.data.val_dir=$VAL_DIR \ diff --git a/main.py b/main.py new file mode 100755 index 0000000..57eba2c --- /dev/null +++ b/main.py @@ -0,0 +1,248 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +# Portions taken from , Copyright Nvidia Corporation + +import math +import os +import sys +from typing import Tuple + +from validations_wrapper import validate_config + +LAUNCHER_SCRIPT_PATH = ( + f"{os.path.dirname(os.path.abspath(__file__))}/launcher/nemo/nemo_framework_launcher/launcher_scripts/" +) +sys.path.append(LAUNCHER_SCRIPT_PATH) + +import hydra +import omegaconf +from nemo_launcher.core.data_curation_stages import QualityFiltering +from nemo_launcher.core.data_stages import ( + CustomDataPreparation, + MC4DataPreparation, + PileDataPreparation, +) +from nemo_launcher.core.export_stages import Export +from nemo_launcher.core.rlhf_stages import RLHFPPO, RLHFRewardModel +from nemo_launcher.core.stages import ( + PEFT, + AdapterLearning, + Conversion, + EvalHarnessEvaluation, + FineTuning, + IA3Learning, + NeMoEvaluation, + PromptLearning, +) + +from launcher.accelerator_devices import ( + get_num_accelerator_devices, + get_num_cores_per_accelerator, +) +from launcher.nemo.recipe_stages import ( + NeMoTraining, + SMTrainingGPURecipe, + SMTrainingTrainiumRecipe, +) +from launcher.nemo.stages import ( + SMCustomTrainingCPU, + SMCustomTrainingGPU, + SMCustomTrainingTrainium, + get_instance_type, +) + +omegaconf.OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_ceil", lambda x, y: int(math.ceil(x / y)), replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_floor", lambda x, y: int(math.floor(x / y)), replace=True) + +STR2STAGECLASS = { + "training": NeMoTraining, + "fine_tuning": FineTuning, + "peft": PEFT, + "prompt_learning": PromptLearning, + "adapter_learning": AdapterLearning, + "ia3_learning": IA3Learning, + "conversion": Conversion, + "export": Export, + "evaluation": { + EvalHarnessEvaluation: ["gpt3", "prompt_gpt3", "llama", "prompt_llama"], + NeMoEvaluation: [ + "t5", + "mt5", + "prompt_t5", + "prompt_mt5", + "adapter_t5", + "adapter_gpt3", + "ia3_t5", + "ia3_gpt3", + "peft_llama", + ], + }, + "data_preparation": { + PileDataPreparation: ["gpt3", "t5", "bert", "llama"], + MC4DataPreparation: ["mt5"], + CustomDataPreparation: ["generic"], + }, + "rlhf_rm": RLHFRewardModel, + "rlhf_ppo": RLHFPPO, + "quality_filtering": QualityFiltering, +} + + +def get_training_stage(cfg): + """ + Get the right training stage based on the device type and if it is custom training + """ + instance_type = get_instance_type(cfg) + is_custom = cfg.get("training_cfg") is not None + + # p and g instances are GPU instances + if instance_type.startswith(("p", "g")): + device_type = "gpu" + elif instance_type.startswith("trn"): + device_type = "trainium" + else: + device_type = "cpu" + + if not is_custom: + if device_type == "gpu": + return SMTrainingGPURecipe + if device_type == "trainium": + return SMTrainingTrainiumRecipe + raise ValueError("Recipe only can be run on GPU or Trainium instances") + else: + if device_type == "gpu": + return SMCustomTrainingGPU + if device_type == "trainium": + return SMCustomTrainingTrainium + return SMCustomTrainingCPU + + +def preprocess_config(cfg) -> Tuple[bool, bool]: + """ + Pre-process the configuration passed to the job + + Returns + ------- + Tuple + boolean: configuration has a custom script + boolean: is it a SageMaker recipe + """ + with 
omegaconf.open_dict(cfg): + cfg.launcher_scripts_path = LAUNCHER_SCRIPT_PATH + # Override the cluster type to align with NeMo + if cfg.get("cluster_type") is None: + assert cfg.get("cluster") is not None + cluster_type = cfg.cluster.cluster_type + else: + cluster_type = cfg.cluster_type + + with omegaconf.open_dict(cfg): + if cluster_type == "slurm": + cfg.cluster_type = "bcm" + else: + cfg.cluster_type = cluster_type + + if cfg.get("wandb_api_key_file") is None: + with omegaconf.open_dict(cfg): + cfg.wandb_api_key_file = None + + if cfg.get("wandb_api_bcp_secret_key") is None: + with omegaconf.open_dict(cfg): + cfg.wandb_api_bcp_secret_key = None + + if cfg.get("training_cfg") is not None: + assert cfg.get("stages") is None, "training_cfg and stages should not set together" + stage_cfg = cfg.get("training_cfg") + assert stage_cfg.get("run") is not None, "run config should be set" + run_config = stage_cfg.get("run") + + if run_config.get("ntasks_per_node") is not None: + ntasks_per_node = run_config.get("ntasks_per_node") + else: + instance_type = get_instance_type(cfg) + if instance_type is not None and get_num_accelerator_devices(instance_type) is not None: + ntasks_per_node = get_num_accelerator_devices(instance_type) * get_num_cores_per_accelerator( + instance_type + ) + else: + ntasks_per_node = 8 + + # To align with https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L721 + with omegaconf.open_dict(stage_cfg): + stage_cfg.trainer = {"devices": ntasks_per_node} + with omegaconf.open_dict(run_config): + run_config.ntasks_per_node = ntasks_per_node + run_config.results_dir = f"{cfg.base_results_dir}/{run_config.name}" + + # To align with https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L313C54-L313C72 + with omegaconf.open_dict(cfg): + cfg.training = {"model": {"ub_tp_comm_overlap": False}} + + return True, False + + if cfg.recipes: + model_type = cfg.recipes.run.get("model_type", None) + + with omegaconf.open_dict(cfg): + cfg.training = cfg.recipes # Point cfg.training to cfg.recipes to avoid conflict in nemo stages + if "hf" in model_type: + return False, True + + return False, False + + +@hydra.main(config_path="recipes_collection", config_name="config", version_base="1.2") +@validate_config +def main(cfg): + has_custom_script, has_sm_recipe = preprocess_config(cfg) + + if has_custom_script: + stage_class = get_training_stage(cfg) + stage = stage_class(cfg) + job_id = stage.run() + else: + requested_stages = cfg.get("stages") or ["training"] + dependency = None + + for stage_name in requested_stages: + # Get our training stages + if stage_name == "training" and has_sm_recipe: + stage_class = get_training_stage(cfg) + else: + stage_class = STR2STAGECLASS[stage_name] + if isinstance(stage_class, dict): + stage_config_choice = cfg.get(f"{stage_name}_config") + choice_model_type = stage_config_choice.rsplit("/", 1)[0] + for cls, model_types in stage_class.items(): + if choice_model_type in model_types: + stage_class = cls + break + + if dependency is not None: + cfg[stage_name]["run"]["dependency"] = dependency + + stage = stage_class(cfg) + job_id = stage.run() + + job_path = stage.get_job_path() + command = " \\\n ".join(sys.argv) + with open(job_path.folder / "launcher_cmd.log", "w") as f: + f.write(command) + + if job_id: + dependency = f"afterany:{job_id}" + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 
0000000..e20d516 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +[tool.pytest.ini_options] +minversion = 7.0 +# durations=0 will display all tests' execution times, sorted starting from the slowest one. +# -vv will also display tests with duration = 0.00s +addopts = [ + "--cache-clear", + "--quiet", + "--durations=0", + "--cov=launcher/", + # uncomment this line to see a detailed HTML test coverage report instead of the usual summary table output to stdout. + # "--cov-report=html", + "tests/", +] +testpaths = ["tests"] +norecursedirs = [".eggs", ".pytest_cache", "*.egg-info", ".git", "build"] diff --git a/recipes_collection/cluster/k8s.yaml b/recipes_collection/cluster/k8s.yaml new file mode 100644 index 0000000..5ed4737 --- /dev/null +++ b/recipes_collection/cluster/k8s.yaml @@ -0,0 +1,67 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +pullPolicy: Always # policy used to pull the container image; can be Always, IfNotPresent or Never +restartPolicy: Never # restart policy +namespace: default # the namespace to submit the job to +# create customized labels for the deployed PyTorchJob and its Pods. +# Example: +# custom_labels: +# label-key-1: label-value-1 +# label-key-2: label-value-2 +custom_labels: null +# create customized annotations for the jobs. +# Example: +# annotations: +# annotation-key-1: annotation-value-1 +# annotation-key-2: annotation-value-2 +annotations: null +# add service account to job pods +# Example: +# serviceAccountName: service_account +service_account_name: null +# priorityClassName used by the Kueue scheduler to decide job priority +priority_class_name: null + +# temp volume, usually used to mount a temp directory +# Example: +# volumes: +# - volumeName: data1 +# hostPath: "/data" +# mountPath: "/data" + +volumes: null + +# persistent volume, usually used to mount FSx +# Example: +# persistent_volume_claims: +# - claimName: null +# mountPath: null +# - claimName: null +# mountPath: null + +# persistent volumes, usually used to mount FSx +persistent_volume_claims: + - null + # This claim should be created before running. Example: + # - claimName: fsx-claim + # mountPath: data + +# Create k8s NodeAffinity to select nodes for deploying jobs that match the required and preferred labels +# Structure: +# label_selector: +# required: +# preferred: +# weights: +# Example: +# label_selector: +# required: +# example-label-key: +# - expected-label-value-1 +# - expected-label-value-2 +# preferred: +# preferred-label-key: +# - preferred-label-value-1 +# - preferred-label-value-2 +# weights: +# - 100 +label_selector: null diff --git a/recipes_collection/cluster/slurm.yaml b/recipes_collection/cluster/slurm.yaml new file mode 100755 index 0000000..79b8e35 --- /dev/null +++ b/recipes_collection/cluster/slurm.yaml @@ -0,0 +1,15 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +exclusive: True +mem: 0 +job_name_prefix: 'sagemaker-' +slurm_create_submission_file_only: False # Set to True to only create the submission file +stderr_to_stdout: True # Set to False to split the stderr and stdout logs +srun_args: + # - "--no-container-mount-home" +slurm_docker_cfg: + docker_args: + # - "--runtime=nvidia" # this is required if the docker runtime version is low + post_launch_commands: # commands that will run after launching the docker container, using bash +container_mounts: # List of additional paths to mount to the container. They will be mounted to the same path.
+ - null diff --git a/recipes_collection/cluster/sm_jobs.yaml b/recipes_collection/cluster/sm_jobs.yaml new file mode 100644 index 0000000..e319002 --- /dev/null +++ b/recipes_collection/cluster/sm_jobs.yaml @@ -0,0 +1,31 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +sm_jobs_config: + output_path: null # S3 output path to output artifacts + tensorboard_config: + output_path: null # Output path for tensorboard logs + container_logs_path: null # Path to logs on the container + wait: True # Whether to wait for training job to finish + inputs: # Inputs to call fit with. Set either s3 or file_system, not both. + s3: # Dictionary of channel names and s3 URIs. For GPUs, use channels for train and validation. + train: null + val: null + file_system: # If using file system input, please pass VPC params in additional_estimator_kwargs. + id: null + type: null + directory_path: null + additional_estimator_kwargs: # All other additional args to pass to estimator. Must be int, float or string. + max_run: 1800 + enable_remote_debug: True + recipe_overrides: null diff --git a/recipes_collection/config.yaml b/recipes_collection/config.yaml new file mode 100755 index 0000000..e8793f3 --- /dev/null +++ b/recipes_collection/config.yaml @@ -0,0 +1,35 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +defaults: + - _self_ + - cluster: slurm # set to `slurm`, `k8s` or `sm_jobs`, depending on the desired cluster + - recipes: training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain # select desired config inside the training directory + - override hydra/job_logging: stdout + +cluster_type: slurm # bcm, bcp, k8s or sm_jobs. If bcm, k8s or sm_jobs, it must match - cluster above. +# If using sm_jobs cluster_type, set sm_jobs_config. See cluster/sm_jobs.yaml for example. + +hydra: + run: + dir: . + output_subdir: null + +debug: False + +instance_type: p5.48xlarge +base_results_dir: null # Location to store the results, checkpoints and logs. + +container: null + +git: + repo_url_or_path: null + branch: null + commit: null + entry_script: null + token: null + +env_vars: + NCCL_DEBUG: WARN # Logging level for NCCL. Set to "INFO" for debug information + +# Do not modify below, use the values above instead. +training_config: ${hydra:runtime.choices.recipes} diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora.yaml new file mode 100644 index 0000000..57f0296 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq128k_gpu_qlora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-405b-seq131072 + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 2 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 16 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 16 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 131072 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 126 + hidden_size: 16384 + num_attention_heads: 128 + intermediate_size: 53248 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: qlora_4bit + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git 
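# Editor's note (not part of the recipe above; added for clarity): a quick sanity check on the
# seq128k QLoRA settings. With max_context_width: 131072 and context_parallel_degree: 16, the
# sequence dimension is split across 16 ranks, so each context-parallel rank holds activations for
# roughly 131072 / 16 = 8192 tokens. Together with offload_activations: True and 4-bit QLoRA, this
# is presumably what lets a 131k-token fine-tune run on the 2 nodes x 8 devices = 16 GPUs configured above.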
a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora.yaml new file mode 100644 index 0000000..a46cb94 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_lora.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-405b-lora + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 6 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: True + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # 405B LoRA does not support export_full_model. + # Instead, use the merge-peft-checkpoint script after training. + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: False + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 48 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 126 + hidden_size: 16384 + num_attention_heads: 128 + intermediate_size: 53248 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # 
Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora.yaml new file mode 100644 index 0000000..0878454 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq16k_gpu_qlora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-405b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 2 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 16 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 126 + hidden_size: 16384 + num_attention_heads: 128 + intermediate_size: 53248 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # 
Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: qlora_4bit + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora.yaml new file mode 100644 index 0000000..ef6e9d9 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_lora.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-405b-lora + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 6 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: True + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # 405B LoRA does not support export_full_model. + # Instead, use the merge-peft-checkpoint script after training. 
+ # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: False + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 48 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 126 + hidden_size: 16384 + num_attention_heads: 128 + intermediate_size: 53248 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora.yaml new file mode 100644 index 0000000..17616b3 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_405b_seq8k_gpu_qlora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-405b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 2 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
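# Editor's sketch (not part of this recipe): these recipes ship with validation disabled via
# limit_val_batches: 0. If validation is wanted, the knobs already present in this file appear to be
# the ones to change; the values below are hypothetical, and the exact behaviour depends on the
# training container that consumes the recipe.
# trainer:
#   val_check_interval: 10     # run validation every 10 training steps
#   limit_val_batches: 10      # run 10 validation batches each time
# model:
#   data:
#     val_dir: /fsx/datasets/my_val_set   # hypothetical path, same dataset_type as train_dir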
+ +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 16 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 126 + hidden_size: 16384 + num_attention_heads: 128 + intermediate_size: 53248 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: qlora_4bit + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning.yaml new file mode 100644 index 0000000..2661164 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_fine_tuning.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 2e-6 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + 
viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora.yaml new file mode 100644 index 0000000..833e6cb --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq16k_gpu_lora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 2 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 16 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # 
Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning.yaml new file mode 100644 index 0000000..f2d216f --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_fine_tuning.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 10 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 40 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + 
original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 2e-6 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_lora.yaml new file mode 100644 index 0000000..d96347d --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_70b_seq8k_gpu_lora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + 
intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning.yaml new file mode 100644 index 0000000..e76bec6 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_fine_tuning.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
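# Editor's note (an observation about the recipes in this PR, not a new setting): the world size of
# a recipe is trainer.devices * trainer.num_nodes, and the parallelism degrees are chosen so that
# they multiply out to that world size. For example:
#   this 8B recipe:                 8 devices * 1 node   = 8 ranks  = shard_degree 8  * tensor_model_parallel_degree 1
#   the 70B seq8k fine-tune above:  8 devices * 10 nodes = 80 ranks = shard_degree 40 * tensor_model_parallel_degree 2
# If devices or num_nodes are changed, the shard/parallel degrees presumably have to be rescaled to match.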
+ +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 2e-6 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_lora.yaml new file mode 100644 index 0000000..a32fe19 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq16k_gpu_lora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git 
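# Editor's sketch (not part of any recipe above): the fine-tuning recipes intentionally leave the
# Hugging Face and data fields null. A hypothetical fill-in for the LoRA recipe above could look like
# the following; all values are placeholders, and the exact dataset layout expected for
# dataset_type: hf is defined by the training container rather than by this PR.
# model:
#   hf_model_name_or_path: meta-llama/Meta-Llama-3-8B   # HF model id or local path (hypothetical)
#   hf_access_token: null                               # or a token, if the model is gated
#   data:
#     train_dir: /fsx/datasets/my_sft_train             # hypothetical path
#     val_dir: null                                     # only needed if validation is enabled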
a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_fine_tuning.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_fine_tuning.yaml new file mode 100644 index 0000000..52f4d6a --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_fine_tuning.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 2 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + 
lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 2e-6 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora.yaml new file mode 100644 index 0000000..be65805 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_gpu_lora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 2 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume 
from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_trn1_fine_tuning.yaml b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_trn1_fine_tuning.yaml new file mode 100644 index 0000000..950688c --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/hf_llama3_8b_seq8k_trn1_fine_tuning.yaml @@ -0,0 +1,129 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: llama-8b-sft + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: neuron-hf + compile: 0 + +name: hf_llama +model_source: hf +seed: 1234 + +trainer: + devices: 32 + num_nodes: 1 + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: 5000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 500 # we do not want val to run during training, hence setting it at a high number + check_val_every_n_epoch: null + num_sanity_val_steps: 0 + limit_val_batches: 0.0 + limit_test_batches: 0.0 + gradient_clip_val: 1.0 + +exp_manager: + log_local_rank_0_only: True # reduce file system access pressure + create_tensorboard_logger: True + explicit_log_dir: null + exp_dir: null + name: hf_llama + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: step + save_top_k: 1 + mode: max + save_last: False + filename: 'hf_llama3_8B_SFT--{step}-{consumed_samples}' + model_parallel_size: ${multiply:${...distributed_strategy.tensor_model_parallel_size}, ${...distributed_strategy.pipeline_model_parallel_size}} + every_n_train_steps: -1 # -1 or 0 disables checkpointing + log_parameter_norm: True # Logs parameter norm across model parallel ranks + log_gradient_norm: True # Logs gradient norm across model parallel ranks + enable_recovery_time_instrumentation: False # default to not printing the detailing timing for recovery + save_xser: True + load_xser: True + save_bf16: False + async_checkpointing: False # default to not use async checkpointing + # resume_from_checkpoint: /home/ubuntu/pretrained_ckpt/ # manually set the checkpoint file to load from. [SFT change 1 required] + resume_from_checkpoint: null # manually set the checkpoint file to load from. [SFT change 1 required] + +distributed_strategy: + tensor_model_parallel_size: 32 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: 1 + zero1: True + sequence_parallel: True + kv_replicator: 4 + +data: + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 64 + packing: True # [SFT] used for appending multiple records in a single record until seq length supported by model, if false uses pad tokens till seq length. 
True increases throughput + use_sft_style_data_module: True # [SFT/PT] used when we want to set the HF style dataloader for megatron model code. + train_dir: null # [SFT] '/ubuntu/training.jsonl' or arrow file. As for SFT we use HF style dataloader, we also use HF style data file paths [SFT change 2 required] + val_dir: null # [SFT] '/ubuntu/training.jsonl' or arrow file [SFT change 3 required] + dev_choose_samples: 2250 # [SFT] if set, will use those many number of records from the head of the dataset instead of using all. Set to null to use full dataset [SFT change 4 required] + seq_length: 4096 # [SFT change 5 required] + tokenizer: + type: null # [SFT change 6 required] + +model: + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + # micro_batch_size: 4 # limited by GPU memory + # global_batch_size: 8 # will use more micro batches to reach global batch size + + # model architecture + model_config: null + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + qkv_linear: True + rope_theta: 500000.0 + + # Miscellaneous + use_cpu_initialization: True # Init weights on the CPU (slow for large models) + weight_init_only: True # [SFT] Load only model states and ignore the optim states from ckpt directory + + ## Activation Checkpointing + activations_checkpoint_granularity: selective # 'selective' or 'full' + + fusions: + softmax: True + flash_attention: True + + do_layer_norm_weight_decay: True + + optim: + name: adamw_fp32OptState + lr: 1.5e-4 + weight_decay: 0.01 + capturable: False + betas: + - 0.9 + - 0.999 + sched: + name: LinearAnnealingWithWarmUp + warmup_steps: 10 + max_steps: ${....trainer.max_steps} + +precision: + type: 'mixed_precision' # ['bf16SR', 'fp32', 'autocast', 'mixed_precision', 'mixed_precisionSR', 'manual'] + # Set the following only if precision type is manual, otherwise they will be automatically set. + master_weights: False + fp32_grad_acc: False + xla_use_bf16: '0' + xla_downcast_bf16: '0' + neuron_rt_stochastic_rounding_en: '0' + +compiler_flags: '--model-type transformer --distribution-strategy=llm-training' +compiler_cache_url: null +aync_exec_max_inflight_requests: 5 +bucket_size_collectives: 1024 +neuron_rt_exec_timeout: 100 +neuron_experimental_compress_rg: False diff --git a/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_fine_tuning.yaml b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_fine_tuning.yaml new file mode 100644 index 0000000..6b66c6e --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_fine_tuning.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
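For reference, the trn1 SFT recipe above only fixes micro_batch_size and global_batch_size; gradient accumulation follows from them, as the comment on max_steps suggests. A minimal sketch of that arithmetic, assuming the usual convention that data-parallel size equals world size divided by tensor- times pipeline-parallel size (the training library computes this internally, so treat the sketch as illustrative):

# Illustrative arithmetic only; values are copied from the trn1 SFT recipe above and
# the formula mirrors the recipe's own comment on max_steps.
devices_per_node = 32          # trainer.devices
num_nodes = 1                  # trainer.num_nodes
tensor_parallel = 32           # distributed_strategy.tensor_model_parallel_size
pipeline_parallel = 1          # distributed_strategy.pipeline_model_parallel_size
micro_batch_size = 1           # data.micro_batch_size
global_batch_size = 64         # data.global_batch_size

world_size = devices_per_node * num_nodes
data_parallel_size = world_size // (tensor_parallel * pipeline_parallel)
grad_accumulation = global_batch_size // (micro_batch_size * data_parallel_size)
print(world_size, data_parallel_size, grad_accumulation)  # 32 1 64

def consumed_samples(global_step: int) -> int:
    # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
    return global_step * micro_batch_size * data_parallel_size * grad_accumulation

print(consumed_samples(5000))  # 320000 samples at trainer.max_steps

Under that reading, the 32-way tensor-parallel group leaves a data-parallel size of 1, so the global batch of 64 is reached entirely through gradient accumulation.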
+ +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: True + use_orig_param: False + + # FP8 config + fp8: False + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: null + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 2e-6 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_lora.yaml new file mode 100644 index 0000000..4b3a23f --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_70b_seq8k_gpu_lora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 20 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 160 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: True + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: null + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git 
a/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_fine_tuning.yaml b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_fine_tuning.yaml new file mode 100644 index 0000000..e6a1778 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_fine_tuning.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 4 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 32 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: True + use_orig_param: False + + # FP8 config + fp8: False + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: null + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # 
Optimizer + optim: + name: adamw + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 2e-6 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_lora.yaml b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_lora.yaml new file mode 100644 index 0000000..c0070b4 --- /dev/null +++ b/recipes_collection/recipes/fine-tuning/llama/p4_hf_llama3_8b_seq8k_gpu_lora.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: False # Disable sagemaker model parallelism for PEFT +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: False + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: True + use_orig_param: False + + # FP8 config + fp8: False # PEFT does not support fp8 + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: null + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: True + # The path to resume from, needs to be HF compatible + 
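"HF compatible" here means a local directory or Hub model ID that the Hugging Face transformers loaders accept directly; hf_access_token is only needed for gated repositories. An illustrative sketch follows (the model ID and token are placeholders, and this is not the launcher's own loading code):

# Placeholder model ID and token; gated repositories additionally require a valid token.
# Only an illustration of what an "HF compatible" path means for hf_model_name_or_path.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "meta-llama/Meta-Llama-3-8B"   # stands in for hf_model_name_or_path
hf_token = None                             # stands in for hf_access_token

tokenizer = AutoTokenizer.from_pretrained(model_path, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_path, token=hf_token)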
hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: lora + rank: 32 + alpha: 16 + dropout: 0.1 + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/custom_model/falcon.yaml b/recipes_collection/recipes/training/custom_model/falcon.yaml new file mode 100644 index 0000000..5aa17c2 --- /dev/null +++ b/recipes_collection/recipes/training/custom_model/falcon.yaml @@ -0,0 +1,109 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: falcon-7b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 2 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 10 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + + +use_smp_model: False #enable SMP +distributed_backend: nccl + + +# Start training from pretrained model +model: + model_type: falcon + do_finetune: False + hf_model_name_or_path: "tiiuae/falcon-7b" + hf_access_token: None + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + use_flash_attention: True + activation_checkpointing: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 16 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # model architecture + max_context_width: 2048 + precision: bf16 + lr_decay_iters: 47683 + log_reduced_training_loss: True + + # PEFT + peft: + peft_type: null # lora + + # Optimizer + optim: + name: adamw + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 2e-5 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + zipped_data: False + + # Viztracer 
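The LoRA recipes above expose rank, alpha and dropout, which in standard LoRA control the adapter width, the alpha/rank scaling of the low-rank update, and dropout on the adapter input. A generic PyTorch sketch of such an adapter, purely illustrative and not the trainer's actual implementation:

# Generic LoRA illustration: the frozen base weight is augmented with a low-rank
# update scaled by alpha / rank, matching the peft.rank / peft.alpha / peft.dropout keys.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 32, alpha: int = 16, dropout: float = 0.1):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False              # pretrained weight stays frozen
        self.lora_a = nn.Linear(base.in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b.weight)       # adapter starts as a no-op update
        self.dropout = nn.Dropout(dropout)
        self.scaling = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(self.dropout(x)))

layer = LoRALinear(nn.Linear(4096, 4096), rank=32, alpha=16, dropout=0.1)
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # trainable adapter params only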
+ viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml new file mode 100644 index 0000000..f08c9f8 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain.yaml @@ -0,0 +1,108 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: llama3.2-11b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +trainer: + devices: 8 + num_nodes: 4 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + + val_check_interval: 1 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${..exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: False + + +use_smp_model: False #enable SMP +distributed_backend: nccl + + +# Start training from pretrained model +model: + model_type: llama_v3 + do_finetune: False + hf_model_name_or_path: "meta-llama/Llama-3.2-11B-Vision-Instruct" + hf_access_token: null # Must be set by user + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + use_flash_attention: True + activation_checkpointing: True + multi_modal: True + delayed_param: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 32 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # model architecture + max_context_width: 8192 + precision: bf16 + lr_decay_iters: 47683 + log_reduced_training_loss: True + + # PEFT + peft: + peft_type: null # lora + + # Optimizer + optim: + name: adamw + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 2e-5 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + tokenizer_name: null + zipped_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.yaml new file mode 100644 index 0000000..fdc1554 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_2_1b_seq8k_gpu_p5x1_pretrain.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +# Basic run information configs +run: + name: llama3.2-1b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 2 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: False + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 16 + hidden_size: 2048 + num_attention_heads: 32 + intermediate_size: 8192 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 32.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + tie_word_embeddings: true + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: True + + # Profiling configs + # Viztracer profiling 
options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.yaml new file mode 100644 index 0000000..4ed420a --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_2_3b_seq8k_gpu_p5x1_pretrain.yaml @@ -0,0 +1,146 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama3.2-3b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: False + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 28 + hidden_size: 3072 + num_attention_heads: 24 + intermediate_size: 8192 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 32.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + tie_word_embeddings: true + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs 
########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: True + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..5d31f85 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_2_90b_seq8k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,108 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: llama3.2-90b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + + val_check_interval: 1 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${..exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: False + + +use_smp_model: False #enable SMP +distributed_backend: nccl + + +# Start training from pretrained model +model: + model_type: llama_v3 + do_finetune: False + hf_model_name_or_path: "meta-llama/Llama-3.2-90B-Vision-Instruct" + hf_access_token: null + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + use_flash_attention: True + activation_checkpointing: True + multi_modal: True + delayed_param: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # model architecture + max_context_width: 8192 + precision: bf16 + lr_decay_iters: 47683 + log_reduced_training_loss: True + + # PEFT + peft: + peft_type: null # lora + + # Optimizer + optim: + name: adamw + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 2e-5 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + tokenizer_name: null + zipped_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..48cfcc0 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,143 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 2 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer profiling options + viztracer: + enabled: false diff --git 
a/recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain.yaml new file mode 100644 index 0000000..f5f311f --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq16k_gpu_p5x64_pretrain.yaml @@ -0,0 +1,143 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 64 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 2 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: 
${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..4f681b5 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,143 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 2 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + 
original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain.yaml new file mode 100644 index 0000000..c2e8e4b --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_gpu_p5x64_pretrain.yaml @@ -0,0 +1,143 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 64 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: 
${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain.yaml new file mode 100644 index 0000000..255c556 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_70b_seq8k_trn1x16_pretrain.yaml @@ -0,0 +1,121 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: llama3-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: neuron-hf + compile: 0 + +name: hf_llama +model_source: hf +seed: 1234 + +trainer: + devices: 32 + num_nodes: 16 + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
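In the GPU pre-training recipes above, sharding_strategy: hybrid_shard is paired with a shard_degree. Assuming shard_degree is the size of each FSDP sharding group (parameters sharded within a group, replicated across groups), its relation to devices and num_nodes is plain arithmetic; the sketch below is illustrative only:

# Illustrative arithmetic, assuming shard_degree = size of one FSDP sharding group
# under hybrid_shard.
def hybrid_shard_layout(devices_per_node: int, num_nodes: int, shard_degree: int):
    world_size = devices_per_node * num_nodes
    assert world_size % shard_degree == 0, "shard_degree must divide the world size"
    replicas = world_size // shard_degree
    return world_size, replicas

print(hybrid_shard_layout(8, 32, 256))  # (256, 1): one group spans the whole cluster
print(hybrid_shard_layout(8, 64, 256))  # (512, 2): parameters replicated across two groups

Under that assumption, the 32-node configuration behaves like full sharding across all ranks, while the 64-node configuration keeps two replicas of a 256-way shard group.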
+ max_steps: 30000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 30001 # we do not want val to run during training, hence setting it at a high number + check_val_every_n_epoch: null + num_sanity_val_steps: 0 + limit_val_batches: 0.0 + limit_test_batches: 0.0 + gradient_clip_val: 1.0 + +exp_manager: + log_local_rank_0_only: True # reduce file system access pressure + create_tensorboard_logger: True + explicit_log_dir: null + exp_dir: null + name: hf_llama + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: step + save_top_k: 1 + mode: max + save_last: False + filename: 'hf_llama3_70B--{step}-{consumed_samples}' + model_parallel_size: ${multiply:${...distributed_strategy.tensor_model_parallel_size}, ${...distributed_strategy.pipeline_model_parallel_size}} + every_n_train_steps: -1 # -1 or 0 disables checkpointing + log_parameter_norm: True # Logs parameter norm across model parallel ranks + log_gradient_norm: True # Logs gradient norm across model parallel ranks + enable_recovery_time_instrumentation: False # default to not printing the detailing timing for recovery + save_xser: True + load_xser: True + save_bf16: False + async_checkpointing: False # default to not use async checkpointing + resume_from_checkpoint: null # manually set the checkpoint file to load from + +distributed_strategy: + tensor_model_parallel_size: 32 + pipeline_model_parallel_size: 8 + virtual_pipeline_model_parallel_size: 1 + zero1: True + sequence_parallel: True + kv_replicator: 4 + +data: + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 1024 + train_dir: null + +model: + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + + # model architecture + model_config: null + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 80 + hidden_size: 8192 + qkv_linear: True + rope_theta: 500000.0 + + # Miscellaneous + use_cpu_initialization: True # Init weights on the CPU (slow for large models) + + ## Activation Checkpointing + activations_checkpoint_granularity: selective # 'selective' or 'full' + activations_checkpoint_recompute_mlp: True # will be restructured in future + + fusions: + softmax: True + flash_attention: True + + do_layer_norm_weight_decay: True + + optim: + name: adamw + lr: 0.000015 + weight_decay: 0.1 + capturable: False + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 2000 + max_steps: ${....trainer.max_steps} + min_lr: 1e-06 + constant_steps: 0 + +precision: + type: 'mixed_precision' # ['bf16SR', 'fp32', 'autocast', 'mixed_precision', 'mixed_precisionSR', 'manual'] + # Set the following only if precision type is manual, otherwise they will be automatically set. 
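The model_parallel_size entry above relies on a custom ${multiply:...} OmegaConf resolver. A minimal stand-in showing how such an interpolation resolves, using absolute keys for clarity where the recipe uses the relative ${...distributed_strategy...} form; the launcher registers its own resolver, so this is illustrative only:

# Stand-in for the "${multiply:...}" interpolation used by model_parallel_size above.
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("multiply", lambda a, b: a * b)

cfg = OmegaConf.create(
    {
        "distributed_strategy": {
            "tensor_model_parallel_size": 32,
            "pipeline_model_parallel_size": 8,
        },
        "model_parallel_size":
            "${multiply:${distributed_strategy.tensor_model_parallel_size},"
            "${distributed_strategy.pipeline_model_parallel_size}}",
    }
)
print(cfg.model_parallel_size)  # 256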
+ master_weights: False + fp32_grad_acc: False + xla_use_bf16: '0' + xla_downcast_bf16: '0' + neuron_rt_stochastic_rounding_en: '0' + +compiler_flags: '--model-type transformer' +compiler_cache_url: null # no-op, defined by https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train_setup.sh#L30 +aync_exec_max_inflight_requests: 5 +bucket_size_collectives: 1024 +neuron_rt_exec_timeout: 100 +neuron_experimental_compress_rg: False diff --git a/recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain.yaml new file mode 100644 index 0000000..84bca86 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x16_pretrain.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + 
factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..8cbb1f8 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq16k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # 
Model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain.yaml new file mode 100644 index 0000000..a9ff3b5 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
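The llama3-8B GPU recipes above combine `sharding_strategy: hybrid_shard` with `shard_degree: 64` on clusters of 8 GPUs per node and 16 or 32 nodes. A minimal sketch of the resulting layout, assuming hybrid sharding shards parameters within groups of `shard_degree` ranks and replicates the model across the remaining groups (the helper name below is illustrative, not part of the recipes):

```python
def hybrid_shard_layout(gpus_per_node: int, num_nodes: int, shard_degree: int) -> dict:
    """Describe how hybrid-shard FSDP splits a cluster, assuming parameters are
    sharded across `shard_degree` ranks and replicated across the rest."""
    world_size = gpus_per_node * num_nodes
    assert world_size % shard_degree == 0, "shard_degree must divide the world size"
    return {
        "world_size": world_size,
        "shard_group_size": shard_degree,                 # ranks that each hold 1/shard_degree of the params
        "replication_groups": world_size // shard_degree, # identical copies of the sharded model
    }

# p5x16 recipes: 8 GPUs x 16 nodes, shard_degree 64 -> 128 ranks, 2 replicas
print(hybrid_shard_layout(8, 16, 64))
# p5x32 recipes: 8 GPUs x 32 nodes, shard_degree 64 -> 256 ranks, 4 replicas
print(hybrid_shard_layout(8, 32, 64))
```

Keeping the shard group smaller than the full cluster keeps parameter all-gathers within fewer ranks, at the cost of holding more than one copy of the sharded model.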
+ +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 2 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..afed3d9 --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 2 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving /distributed training configs + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # FP8 config + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: 
false diff --git a/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain.yaml b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain.yaml new file mode 100644 index 0000000..bcee6ac --- /dev/null +++ b/recipes_collection/recipes/training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain.yaml @@ -0,0 +1,118 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: llama3-8b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: neuron-hf + compile: 0 + +name: hf_llama +model_source: hf +seed: 1234 + +trainer: + devices: 32 + num_nodes: 4 + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 10001 # we do not want val to run during training, hence setting it at a high number + check_val_every_n_epoch: null + num_sanity_val_steps: 0 + limit_val_batches: 0.0 + limit_test_batches: 0.0 + gradient_clip_val: 1.0 + +exp_manager: + log_local_rank_0_only: True # reduce file system access pressure + create_tensorboard_logger: True + explicit_log_dir: null + exp_dir: null + name: hf_llama + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: step + save_top_k: 1 + mode: max + save_last: False + filename: 'hf_llama3_8B--{step}-{consumed_samples}' + model_parallel_size: ${multiply:${...distributed_strategy.tensor_model_parallel_size}, ${...distributed_strategy.pipeline_model_parallel_size}} + every_n_train_steps: 10 + log_parameter_norm: True # Logs parameter norm across model parallel ranks + log_gradient_norm: True # Logs gradient norm across model parallel ranks + enable_recovery_time_instrumentation: False # default to not printing the detailing timing for recovery + save_xser: True + load_xser: True + save_bf16: False + async_checkpointing: False # default to not use async checkpointing + resume_from_checkpoint: null # manually set the checkpoint file to load from + +distributed_strategy: + tensor_model_parallel_size: 32 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: 1 + zero1: True + sequence_parallel: True + kv_replicator: 4 + +data: + micro_batch_size: 1 # limited by GPU memory + global_batch_size: 1024 + train_dir: null + +model: + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + + # model architecture + model_config: null + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + qkv_linear: True + rope_theta: 500000.0 + + # Miscellaneous + use_cpu_initialization: True # Init weights on the CPU (slow for large models) + + ## Activation Checkpointing + activations_checkpoint_granularity: selective # 'selective' or 'full' + + fusions: + softmax: True + flash_attention: True + + do_layer_norm_weight_decay: True + + optim: + name: adamw_fp32OptState + lr: 1.5e-4 + weight_decay: 0.01 + capturable: False + betas: + - 0.9 + - 0.999 + sched: + name: LinearAnnealingWithWarmUp + warmup_steps: 100 + max_steps: ${....trainer.max_steps} + +precision: + type: 'mixed_precision' # ['bf16SR', 'fp32', 'autocast', 'mixed_precision', 'mixed_precisionSR', 'manual'] + # Set the following only if precision type is manual, otherwise they will be automatically set. 
+ master_weights: False + fp32_grad_acc: False + xla_use_bf16: '0' + xla_downcast_bf16: '0' + neuron_rt_stochastic_rounding_en: '0' + +compiler_flags: '--model-type transformer --distribution-strategy=llm-training' +compiler_cache_url: null # no-op, defined by https://github.com/aws-neuron/neuronx-distributed-training/blob/main/examples/train_setup.sh#L30 +aync_exec_max_inflight_requests: 5 +bucket_size_collectives: 1024 +neuron_rt_exec_timeout: 100 +neuron_experimental_compress_rg: False diff --git a/recipes_collection/recipes/training/llama/megatron_llama3_1_8b_nemo.yaml b/recipes_collection/recipes/training/llama/megatron_llama3_1_8b_nemo.yaml new file mode 100644 index 0000000..5d27b39 --- /dev/null +++ b/recipes_collection/recipes/training/llama/megatron_llama3_1_8b_nemo.yaml @@ -0,0 +1,168 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Referred from https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/main/launcher_scripts/conf/training/llama/llama3_1_8b.yaml +run: + name: llama3_1_8b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:30:00" + dependency: "singleton" +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: False + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + context_parallel_size: 1 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + num_query_groups: 8 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + 
cross_entropy_loss_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + scale_positional_embedding: true + tokenizer: + library: 'sentencepiece' + type: null + model: /tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + deterministic_mode: false + + ## Transformer Engine + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mock + splits_string: 99990,8,2 + seq_length: ${training.model.encoder_seq_length} + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: true + reset_attention_mask: true + eod_mask_loss: false + index_mapping_dir: null + data_prefix: [] diff --git a/recipes_collection/recipes/training/llama/p4_hf_llama3_70b_seq8k_gpu.yaml b/recipes_collection/recipes/training/llama/p4_hf_llama3_70b_seq8k_gpu.yaml new file mode 100644 index 0000000..babeee7 --- /dev/null +++ b/recipes_collection/recipes/training/llama/p4_hf_llama3_70b_seq8k_gpu.yaml @@ -0,0 +1,144 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +# Basic run information configs +run: + name: llama-70b + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + +# Basic pytorch lightning trainer config +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
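The Neuron (trn) recipes above leave gradient accumulation implicit: their comments state that `consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches`, and `model_parallel_size` is derived with the `${multiply:...}` resolver from the tensor and pipeline parallel sizes. A sketch of that bookkeeping using the visible hf_llama3_8b_seq8k_trn1x4 values (32 devices x 4 nodes, TP 32, PP 1, micro batch 1, global batch 1024); treat it as an illustration of the formulas in the comments, not the launcher's actual code:

```python
def derive_parallelism(devices, num_nodes, tp, pp, micro_batch, global_batch):
    world_size = devices * num_nodes
    model_parallel = tp * pp                      # what ${multiply:...} computes
    assert world_size % model_parallel == 0
    data_parallel = world_size // model_parallel
    assert global_batch % (micro_batch * data_parallel) == 0
    grad_accum = global_batch // (micro_batch * data_parallel)
    return model_parallel, data_parallel, grad_accum

mp, dp, accum = derive_parallelism(devices=32, num_nodes=4, tp=32, pp=1,
                                   micro_batch=1, global_batch=1024)
print(mp, dp, accum)        # 32, 4, 256

# consumed_samples after `step` optimizer steps, per the comment in the recipes
step = 10_000
consumed_samples = step * 1 * dp * accum   # micro_batch * data_parallel * grad_accum
print(consumed_samples)                    # 10,240,000 = step * global_batch_size
```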
+ +# Basic pytorch lightning experiment config +# Config for checkpoint/tensorboard etc +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + # Configs to save checkpoint with a fixed interval + # Note: These config will not work with auto checkpoint mode + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Enable auto_checkpoint to automatically calculate the checkpoint interval and resume from checkpoint + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +################# Predefined configs ########################## +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + + +# Model training configs +model: + model_type: llama_v3 + # Base configs + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: True + + # Memory saving / distributed training configs + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: False + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: True + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: True + use_orig_param: False + + # FP8 config + fp8: False + + # Model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 80 + hidden_size: 8192 + num_attention_heads: 64 + intermediate_size: 28672 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: True + rope_theta: 500000.0 + + # rope scaling for llama3 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + + # Finetuning config + do_finetune: False + # The path to resume from, needs to be HF compatible + hf_model_name_or_path: null + hf_access_token: null + # PEFT config + peft: + peft_type: null # lora + + precision: ${recipes.trainer.precision} + ################# End of Predefined configs ########################## + + # Learning rate and optimizer configs + lr_decay_iters: ${recipes.trainer.max_steps} + # Optimizer + optim: + name: adamw + lr: 2e-5 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 2e-6 + + # Data configs + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Profiling configs + # Viztracer profiling options + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml new file mode 100644 index 0000000..f401d22 --- /dev/null +++ b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x16_pretrain.yaml @@ -0,0 +1,127 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +run: + name: mistral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mistral + + train_batch_size: 1 + val_batch_size: 1 + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mistral_sliding_window: 4096 + rms_norm_eps: 1e-5 + rope_theta: 1000000.0 + + # Transformer Engine + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..84b7321 --- /dev/null +++ b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq16k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,127 @@ +# Original Copyright (c), NVIDIA CORPORATION. 
Modifications © Amazon.com + +run: + name: mistral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. + + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mistral + + train_batch_size: 1 + val_batch_size: 1 + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mistral_sliding_window: 4096 + rms_norm_eps: 1e-5 + rope_theta: 1000000.0 + + # Transformer Engine + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml new file mode 100644 index 0000000..75da0df --- /dev/null +++ b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x16_pretrain.yaml @@ -0,0 +1,127 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mistral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
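The Mistral recipes set `mistral_sliding_window: 4096` while training at 8k or 16k context. As a rough illustration of what a sliding-window causal mask means (each query position attends only to itself and the previous `window - 1` tokens), here is a small NumPy sketch; the real path is a fused flash-attention kernel, so this is for intuition only:

```python
import numpy as np

def sliding_window_causal_mask(seq_len: int, window: int) -> np.ndarray:
    """True where query i may attend to key j: causal and within the window."""
    i = np.arange(seq_len)[:, None]
    j = np.arange(seq_len)[None, :]
    return (j <= i) & (j > i - window)

mask = sliding_window_causal_mask(seq_len=8, window=4)
print(mask.astype(int))
# Row 7 attends to positions 4..7 only, instead of 0..7 for full causal attention.
```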
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mistral + + train_batch_size: 2 + val_batch_size: 1 + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mistral_sliding_window: 4096 + rms_norm_eps: 1e-5 + rope_theta: 1000000.0 + + # Transformer Engine + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..31c777b --- /dev/null +++ b/recipes_collection/recipes/training/mistral/hf_mistral_7b_seq8k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,127 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mistral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
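Every GPU recipe in this change uses the same scheduler block: `CosineAnnealing` with `warmup_steps: 0`, `constant_steps: 0`, `lr: 0.0001`, `min_lr: 0.000001`, and `lr_decay_iters` tied to `trainer.max_steps`. A minimal sketch of a warmup-then-cosine-decay curve with those numbers, assuming the standard cosine form; the recipes rely on the framework's own scheduler, so this is illustrative only:

```python
import math

def cosine_lr(step, max_lr=1e-4, min_lr=1e-6, warmup_steps=0, decay_steps=50):
    """Linear warmup (if any) followed by cosine decay from max_lr to min_lr."""
    if warmup_steps and step < warmup_steps:
        return max_lr * (step + 1) / warmup_steps
    progress = min(1.0, (step - warmup_steps) / max(1, decay_steps - warmup_steps))
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))

for s in (0, 25, 49):
    print(s, f"{cosine_lr(s):.2e}")   # starts near 1e-4, ends near 1e-6
```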
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: True + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mistral + + train_batch_size: 2 + val_batch_size: 1 + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + moe: False + activation_checkpointing: False + activation_loading_horizon: 1 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: hybrid_shard + forward_prefetch: True + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: False + + # model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mistral_sliding_window: 4096 + rms_norm_eps: 1e-5 + rope_theta: 1000000.0 + + # Transformer Engine + fp8: True + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..ff9c6e4 --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,133 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
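The larger recipes (the p4 llama3-70b config and the Mixtral configs) enable `activation_checkpointing`, and the 70B config additionally enables `offload_activations` with `activation_loading_horizon: 2`, trading recomputation and host transfers for device memory. For intuition, a minimal PyTorch sketch of wrapping a stand-in MLP block with activation checkpointing; this is not the SMP implementation, just the generic mechanism those flags build on:

```python
import torch
from torch.utils.checkpoint import checkpoint

# Stand-in MLP block; the real recipes use gated (SwiGLU-style) transformer MLPs.
block = torch.nn.Sequential(
    torch.nn.Linear(4096, 14336),
    torch.nn.SiLU(),
    torch.nn.Linear(14336, 4096),
)

x = torch.randn(2, 4096, requires_grad=True)

# Activations inside `block` are not kept; they are recomputed during backward.
y = checkpoint(block, x, use_reentrant=False)
y.sum().backward()
print(x.grad.shape)   # torch.Size([2, 4096])
```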
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 1 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 56 + hidden_size: 6144 + num_attention_heads: 48 + intermediate_size: 16384 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32768 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml new file mode 100644 index 0000000..51b5d4a --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq16k_gpu_p5x64_pretrain.yaml @@ -0,0 +1,132 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 64 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
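The Mixtral recipes describe the MoE layer as `num_local_experts: 8` with `num_experts_per_tok: 2`, sinkhorn load balancing, and an optional all-to-all dispatcher. A toy top-2 router sketch in NumPy (plain softmax gating; the sinkhorn balancing, token shuffling, and expert parallel dispatch that the real implementation handles are deliberately omitted):

```python
import numpy as np

rng = np.random.default_rng(0)
num_tokens, hidden, num_experts, top_k = 4, 8, 8, 2

router_logits = rng.normal(size=(num_tokens, num_experts))
probs = np.exp(router_logits) / np.exp(router_logits).sum(-1, keepdims=True)

# Pick the top-k experts per token and renormalize their gate weights.
top_idx = np.argsort(-probs, axis=-1)[:, :top_k]
top_w = np.take_along_axis(probs, top_idx, axis=-1)
top_w = top_w / top_w.sum(-1, keepdims=True)

x = rng.normal(size=(num_tokens, hidden))
experts = [rng.normal(size=(hidden, hidden)) for _ in range(num_experts)]  # toy expert FFNs

out = np.zeros_like(x)
for t in range(num_tokens):
    for k in range(top_k):
        out[t] += top_w[t, k] * (x[t] @ experts[top_idx[t, k]])
print(out.shape)   # (4, 8): each token touched only 2 of the 8 experts
```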
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 1 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 56 + hidden_size: 6144 + num_attention_heads: 48 + intermediate_size: 16384 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32768 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..84dd89e --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,133 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
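The llama3-8B and Mistral GPU recipes turn on `fp8: True` with `fp8_amax_history_len: 1024` and `fp8_amax_compute_algo: max`; in a delayed-scaling FP8 recipe those knobs control how each tensor's scale is derived from a rolling history of absolute maxima. A small sketch of that bookkeeping in plain Python (not Transformer Engine's API; 448 is the largest finite value of the E4M3 format):

```python
from collections import deque

FP8_E4M3_MAX = 448.0

class DelayedScaling:
    """Track a rolling amax history and derive a cast scale, mimicking the
    fp8_amax_history_len / fp8_amax_compute_algo knobs in the recipes."""
    def __init__(self, history_len=1024, algo="max"):
        self.history = deque(maxlen=history_len)
        self.algo = algo

    def update(self, tensor_abs_max: float) -> float:
        self.history.append(tensor_abs_max)
        amax = max(self.history) if self.algo == "max" else self.history[-1]
        return FP8_E4M3_MAX / amax   # scale applied before casting to FP8

scaler = DelayedScaling(history_len=1024, algo="max")
for amax in (1.5, 3.0, 2.0):
    print(scaler.update(amax))   # the scale shrinks as larger amax values enter the history
```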
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 2 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 56 + hidden_size: 6144 + num_attention_heads: 48 + intermediate_size: 16384 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32768 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml new file mode 100644 index 0000000..1340801 --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x22b_seq8k_gpu_p5x64_pretrain.yaml @@ -0,0 +1,131 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 64 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 2 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 256 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 56 + hidden_size: 6144 + num_attention_heads: 48 + intermediate_size: 16384 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32768 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml new file mode 100644 index 0000000..45879bc --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x16_pretrain.yaml @@ -0,0 +1,133 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
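All of these recipes use grouped-query attention: `num_attention_heads: 32` (48 for the 8x22B configs) with `num_key_value_heads: 8`, so several query heads share each KV head. A quick calculation of what that does to per-token KV-cache size for the 4096-hidden geometry, assuming head_dim = hidden / num_heads and bf16 (2-byte) storage; the "full multi-head" case is a hypothetical baseline for comparison:

```python
def kv_cache_bytes_per_token(hidden, num_heads, num_kv_heads, num_layers, dtype_bytes=2):
    head_dim = hidden // num_heads
    # K and V each store num_kv_heads * head_dim values per layer.
    return 2 * num_kv_heads * head_dim * num_layers * dtype_bytes

mha = kv_cache_bytes_per_token(4096, 32, 32, 32)   # hypothetical full multi-head attention
gqa = kv_cache_bytes_per_token(4096, 32, 8, 32)    # the recipes' 8 KV heads
print(mha, gqa, mha // gqa)   # 524288 131072 4  -> 4x smaller KV cache per token
```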
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 2 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 128 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..7e19519 --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq16k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,133 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
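The recipes lean heavily on OmegaConf interpolation: relative references such as `${.max_context_width}`, absolute ones such as `${recipes.trainer.precision}`, and a `${multiply:...}` resolver for `model_parallel_size`. A minimal sketch of how such a config resolves, assuming the launcher registers a `multiply` resolver; the resolver name comes from the recipes, but the registration and the flattened paths shown here are illustrative:

```python
from omegaconf import OmegaConf

# The recipes reference ${multiply:...}; a launcher would register it roughly like this.
OmegaConf.register_new_resolver("multiply", lambda a, b: int(a) * int(b), replace=True)

cfg = OmegaConf.create({
    "distributed_strategy": {"tensor_model_parallel_size": 32,
                             "pipeline_model_parallel_size": 1},
    "model": {"max_context_width": 8192,
              "max_position_embeddings": "${.max_context_width}"},
    "model_parallel_size": "${multiply:${distributed_strategy.tensor_model_parallel_size},"
                           "${distributed_strategy.pipeline_model_parallel_size}}",
})

print(cfg.model.max_position_embeddings)  # 8192, resolved relative to the `model` node
print(cfg.model_parallel_size)            # 32
```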
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 2 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 128 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 16384 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml new file mode 100644 index 0000000..77a5a22 --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x16_pretrain.yaml @@ -0,0 +1,133 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 2 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 128 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml new file mode 100644 index 0000000..95ff479 --- /dev/null +++ b/recipes_collection/recipes/training/mixtral/hf_mixtral_8x7b_seq8k_gpu_p5x32_pretrain.yaml @@ -0,0 +1,133 @@ +# Original Copyright (c), NVIDIA CORPORATION. Modifications © Amazon.com + +run: + name: mixtral + results_dir: ${base_results_dir}/${.name} + time_limit: "6-00:00:00" + model_type: hf # huggingface for our recipes + + +trainer: + devices: 8 + num_nodes: 32 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 # Number of batches per each validation run, set to 0 to disable validation. 
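+
+# Illustrative sketch (not part of this recipe's defaults): this recipe keeps
+# fp8: False in its Transformer Engine section. On P5 (H100) capacity one might
+# experiment with enabling it, reusing the existing amax settings:
+#   fp8: True
+#   fp8_amax_history_len: 1024
+#   fp8_amax_compute_algo: max
+# Whether fp8 helps this MoE configuration is an assumption to verify, not a
+# recommendation.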
+ + +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: True + create_checkpoint_callback: False + checkpoint_callback_params: + # Set save_top_k = 0 to disable sharded checkpointing + save_top_k: 0 + every_n_train_steps: 10 + monitor: "step" + mode: "max" + save_last: False + checkpoint_dir: ${recipes.exp_manager.exp_dir}/checkpoints/ + resume_from_checkpoint: null + # Set auto_checkpoint = False to disable auto resilience checkpointing + auto_checkpoint: + enabled: False + export_full_model: + # Set every_n_train_steps = 0 to disable full checkpointing + every_n_train_steps: 0 + save_last: True + +use_smp_model: True # Enable sagemaker model parallelism +distributed_backend: nccl + +# Start training from pretrained model +model: + model_type: mixtral + + train_batch_size: 4 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: True + activation_checkpointing: True + activation_loading_horizon: 2 + delayed_param: True + offload_activations: False + seed: 12345 + grad_clip: 1.0 + + + # FSDP Configs + sharding_strategy: full_shard + forward_prefetch: True + shard_degree: 128 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: True + + # model architecture + max_context_width: 8192 + max_position_embeddings: ${.max_context_width} + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1e-5 + vocab_size: 32000 + num_key_value_heads: 8 + use_flash_attention: True + mixtral_sliding_window: null + num_experts_per_tok: 2 + num_local_experts: 8 + moe_load_balancing: 'sinkhorn' + global_token_shuffle: True + moe_all_to_all_dispatcher: False + rope_theta: 1000000.0 + + # Transformer Engine + fp8: False + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + + # finetune + do_finetune: False + # smp calls it `pretrained_model_weights` but we opted to follow the name used by HF + hf_model_name_or_path: null + + precision: ${recipes.trainer.precision} + + lr_decay_iters: ${recipes.trainer.max_steps} + + log_reduced_training_loss: True + + # Optimizer + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + + + + # Data + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: False + + # Viztracer + viztracer: + enabled: false diff --git a/requirements.txt b/requirements.txt new file mode 100755 index 0000000..ee556b0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +hydra-core==1.3.2 +omegaconf>=2.2,<2.3 +pynvml==11.4.1 +requests==2.26.0 +tqdm==4.62.3 +zstandard==0.15.2 +tensorboard==2.12.0 +boto3==1.35.66 diff --git a/scripts/licenseChecker.sh b/scripts/licenseChecker.sh new file mode 100644 index 0000000..2c49a63 --- /dev/null +++ b/scripts/licenseChecker.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific +# language governing permissions and limitations under the License. + +check_licenses() { + LICENSE_LIST=$(cat ./ApprovedLicenses.txt | tr '\n' '|'| sed 's/|$//') + pip-licenses --summary > LicenseSummary.txt + awk '{$1=""; print $0}' ./LicenseSummary.txt | tail -n +2 | sed 's/;/\n/g' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//'| sort -u > ./newLicenseSummary.txt + while IFS= read -r line || [[ -n "$line" ]]; do + if ! echo "$LICENSE_LIST" | grep -q "$line"; then + echo "License '$line' is not in the allowed list." + exit 1 + fi + done < ./newLicenseSummary.txt + + if ! grep -q "prohibited-license: Did not find content matching specified patterns" ./scanOutput.txt; then + echo "Prohibited License Used in Source Code Scan: " + sed -n '/⚠ prohibited-license:/,/⚠ third-party-license-file:/p' ./scanOutput.txt | sed '1d;$d'| cat + exit 1 + fi + echo "License Check complete" +} + +check_licenses diff --git a/template/sm_jobs.py b/template/sm_jobs.py new file mode 100644 index 0000000..7d7eccf --- /dev/null +++ b/template/sm_jobs.py @@ -0,0 +1,140 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import argparse +import logging +import os + +import omegaconf +import sagemaker +from omegaconf import OmegaConf +from sagemaker.debugger import TensorBoardOutputConfig +from sagemaker.inputs import FileSystemInput +from sagemaker.interactive_apps import SupportedInteractiveAppTypes +from sagemaker.pytorch import PyTorch + +logger = logging.getLogger(__name__) + + +def parse_args(): + script_dir = os.path.dirname(os.path.join(os.path.realpath(__file__))) + parser = argparse.ArgumentParser(description="Launch training recipe using SM jobs") + parser.add_argument( + "--recipe", type=str, default=os.path.join(script_dir, "recipe.yaml"), help="Path to recipe config." 
+    )
+    parser.add_argument(
+        "--sm_jobs_config",
+        type=str,
+        default=os.path.join(script_dir, "sm_jobs_config.yaml"),
+        help="Path to sm jobs config.",
+    )
+    parser.add_argument("--job_name", type=str, required=True, help="Job name for the SDK job.")
+    parser.add_argument("--instance_type", type=str, required=True, help="Instance type to use for the training job.")
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    sagemaker_session = sagemaker.Session()
+    role = sagemaker.get_execution_role()
+
+    sm_jobs_config = OmegaConf.load(args.sm_jobs_config)
+    recipe_overrides = sm_jobs_config.get("recipe_overrides", omegaconf.DictConfig(dict()))
+    recipe = OmegaConf.load(args.recipe)
+    recipe = OmegaConf.merge(recipe, recipe_overrides)
+    recipe_overrides = OmegaConf.to_container(recipe_overrides)
+
+    sm_inputs = sm_jobs_config.get("inputs")
+    inputs = None
+    if sm_inputs:
+        s3 = sm_inputs.get("s3")
+        file_system = sm_inputs.get("file_system")
+        if s3 and file_system:
+            raise ValueError("Must set only one of s3 or file_system in sm_jobs_config.inputs.")
+        if s3 is None and file_system is None:
+            raise ValueError("Must set either s3 or file_system in sm_jobs_config.inputs.")
+        if s3:
+            inputs = OmegaConf.to_container(s3)
+        else:
+            file_system_id = file_system.get("id")
+            file_system_type = file_system.get("type")
+            directory_path = file_system.get("directory_path")
+            if file_system_id is None or file_system_type is None or directory_path is None:
+                raise ValueError("Must set id, type and directory_path for file_system input type in sm_jobs_config.")
+            inputs = FileSystemInput(
+                file_system_id=file_system_id,
+                file_system_type=file_system_type,
+                directory_path=directory_path,
+                file_system_access_mode="ro",
+            )
+
+    output_path = sm_jobs_config.get("output_path")
+    if output_path is None:
+        raise ValueError("Expected output_path to be set with sm_jobs cluster type")
+
+    additional_estimator_kwargs = sm_jobs_config.get("additional_estimator_kwargs", omegaconf.DictConfig(dict()))
+    additional_estimator_kwargs = OmegaConf.to_container(additional_estimator_kwargs)
+
+    tensorboard_config = sm_jobs_config.get("tensorboard_config")
+    if tensorboard_config:
+        tb_output_path = tensorboard_config.get("output_path")
+        tb_container_path = tensorboard_config.get("container_logs_path")
+        if tb_output_path is None or tb_container_path is None:
+            raise ValueError("Please set output path and container path when using tensorboard.")
+        tensorboard_output_config = TensorBoardOutputConfig(
+            s3_output_path=tb_output_path, container_local_output_path=tb_container_path
+        )
+        additional_estimator_kwargs["tensorboard_output_config"] = tensorboard_output_config
+        if recipe.get("exp_manager") is None or recipe.get("exp_manager", dict()).get("explicit_log_dir") is None:
+            logger.warning("Using tensorboard, but exp_manager.explicit_log_dir is not set in the recipe.")
+
+    base_job_name = args.job_name.replace(".", "-")
+    base_job_name = base_job_name.replace("_", "-")
+    estimator = PyTorch(
+        base_job_name=base_job_name,
+        instance_type=args.instance_type,
+        training_recipe=args.recipe,
+        recipe_overrides=recipe_overrides,
+        output_path=output_path,
+        role=role,
+        sagemaker_session=sagemaker_session,
+        **additional_estimator_kwargs,
+    )
+
+    if tensorboard_config:
+        logger.info("Tensorboard url:")
+        logger.info(
+            estimator.get_app_url(
+                app_type=SupportedInteractiveAppTypes.TENSORBOARD,
+                open_in_default_web_browser=False,
+            )
+        )
+
+    # inputs is None when no inputs are configured; only prune keys for the s3 (dict) case.
+    if inputs is not None and not isinstance(inputs, FileSystemInput):
+        keys_to_pop = 
[]
+        for item in inputs.keys():
+            if not inputs[item]:
+                print(f"Popping input {item} with empty value {inputs[item]}")
+                keys_to_pop.append(item)
+        for item in keys_to_pop:
+            inputs.pop(item)
+        if len(inputs) == 0:
+            inputs = None
+
+    estimator.fit(inputs=inputs, wait=sm_jobs_config.get("wait", False))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/config_validator/test_type_validator.py b/tests/config_validator/test_type_validator.py
new file mode 100644
index 0000000..4eac6e6
--- /dev/null
+++ b/tests/config_validator/test_type_validator.py
@@ -0,0 +1,133 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import pytest
+from omegaconf import OmegaConf
+
+from launcher.config_validator.type_validator import (
+    TypeValidator,
+    _check_types,
+    _is_dict,
+    _is_list_of_dicts,
+    _is_list_of_paths,
+    _is_list_of_strings,
+    _is_positive_integer,
+    _is_valid_path,
+)
+
+# Sample test configuration
+sample_test_config = OmegaConf.create(
+    {
+        "hydra": {"output_subdir": None, "run": {"dir": "."}},
+        "git": {"repo_url_or_path": "https://example.com/repo", "branch": "main", "commit": "abc123"},
+        "training_cfg": {
+            "entry_script": "/path/to/script.py",
+            "script_args": [{"arg1": "value1"}, {"arg2": "value2"}],
+            "run": {"name": "experiment", "nodes": 2, "ntasks_per_node": 4},
+        },
+        "cluster": {
+            "cluster_type": "k8s",
+            "instance_type": "p5.48xlarge",
+            "cluster_config": {
+                "namespace": "default",
+                "custom_labels": {"env": "dev"},
+                "annotations": {"annotation1": "value1"},
+                "priority_class_name": "high",
+                "label_selector": {"key": "value"},
+                "persistentVolumeClaim": {"claimName": "claim1", "mountPath": "/mount/path"},
+                "pullPolicy": "Always",
+                "restartPolicy": "OnFailure",
+            },
+        },
+        "base_results_dir": "/results",
+        "container_mounts": ["/mnt/data", "/mnt/other"],
+        "container": "my_container",
+        "env_vars": {"VAR1": "value1"},
+    }
+)
+
+
+@pytest.fixture
+def sample_config():
+    return sample_test_config
+
+
+def test_type_validator_validation(sample_config):
+    validator = TypeValidator(sample_config)
+    try:
+        validator.validate()
+    except TypeError:
+        pytest.fail("Type validation failed unexpectedly")
+
+
+def test_is_valid_path():
+    assert _is_valid_path("/valid/path") is True
+    assert _is_valid_path(None) is False
+
+
+def test_is_positive_integer():
+    assert _is_positive_integer(1) is True
+    assert _is_positive_integer("10") is True
+    assert _is_positive_integer("0") is False
+    assert _is_positive_integer(-1) is False
+    assert _is_positive_integer("string") is False
+
+
+def test_is_list_of_dicts():
+    assert _is_list_of_dicts(OmegaConf.create([{"key": "value"}, {"key2": "value2"}])) is True
+    assert _is_list_of_dicts(OmegaConf.create([{"key": "value"}, "string"])) is False
+    assert _is_list_of_dicts(OmegaConf.create("string")) is False
+    assert _is_list_of_dicts(OmegaConf.create(None)) is False
+
+
+def test_is_list_of_strings():
+    assert 
_is_list_of_strings(OmegaConf.create(["string1", "string2"])) is True + assert _is_list_of_strings(OmegaConf.create(["string1", 2])) is False + assert _is_list_of_strings(OmegaConf.create("string")) is False + assert _is_list_of_strings(OmegaConf.create(None)) is False + + +def test_is_list_of_paths(): + assert _is_list_of_paths(OmegaConf.create(["/valid/path1", "/valid/path2"])) is True + assert _is_list_of_paths(OmegaConf.create("string")) is False + assert _is_list_of_paths(OmegaConf.create(None)) is False + + +def test_is_dict(): + assert _is_dict(OmegaConf.create({"key": "value"})) is True + assert _is_dict(OmegaConf.create([{"key": "value"}])) is False + assert _is_dict(OmegaConf.create(None)) is False + assert _is_dict(None) is False + + +def test_check_types(): + with pytest.raises(TypeError): + _check_types(0, "positive_integer", "test_integer") + + with pytest.raises(TypeError): + _check_types(["string1", 2], "list_string", "test_list") + + with pytest.raises(TypeError): + _check_types("string", "list_dict", "test_list") + + with pytest.raises(TypeError): + _check_types("string", "list_path", "test_list") + + with pytest.raises(TypeError): + _check_types({"key": "value"}, "string", "test_string") + + with pytest.raises(TypeError): + _check_types({"key": "value"}, "path", "test_path") + + with pytest.raises(TypeError): + _check_types(OmegaConf.create(None), "dict", "test_dict") diff --git a/tests/config_validator/test_value_validator.py b/tests/config_validator/test_value_validator.py new file mode 100644 index 0000000..fce7268 --- /dev/null +++ b/tests/config_validator/test_value_validator.py @@ -0,0 +1,152 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+ +import pytest +from omegaconf import OmegaConf + +from launcher.config_validator.value_validator import ValueValidator + +# Sample valid configurations +VALID_CONFIGS = [ + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": { + "cluster_type": "k8s", + "instance_type": "p5.48xlarge", + "cluster_config": { + "namespace": "valid-namespace", + "pullPolicy": "Always", + "restartPolicy": "OnFailure", + "persistentVolumeClaims": [{"claimName": "my-claim", "mountPath": "/mount/path"}], + }, + }, + "training_cfg": { + "entry_script": "/path/to/entry_script.py", + "run": {"name": "run_name", "nodes": 2}, + "script_args": [{"arg1": "value1"}], + }, + "container": "my_container", + "env_vars": {"VAR1": "value1"}, + } + ), + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": { + "cluster_type": "slurm", + }, + }, + ), + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": { + "cluster_type": "sm_jobs", + }, + }, + ), +] + +# Sample invalid configurations +INVALID_CONFIGS = [ + # Missing mandatory argument base_results_dir + OmegaConf.create({"cluster": {"cluster_type": "k8s", "cluster_config": {"namespace": "valid-namespace"}}}), + # Invalid pull policy + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": { + "cluster_type": "k8s", + "cluster_config": {"namespace": "valid-namespace", "pullPolicy": "InvalidPolicy"}, + }, + } + ), + # Invalid restart policy + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": { + "cluster_type": "k8s", + "cluster_config": {"namespace": "valid-namespace", "restartPolicy": "InvalidPolicy"}, + }, + } + ), + # Invalid cluster type + OmegaConf.create({"base_results_dir": "/some/path", "cluster": {"cluster_type": "invalid_type"}}), + # Invalid namespace + OmegaConf.create( + {"base_results_dir": "/some/path", "cluster": {"cluster_config": {"namespace": "-invalidnamespace"}}} + ), + # Missing persistentVolumeClaim arguments + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": {"cluster_config": {"namespace": "valid-namespace", "persistentVolumeClaims": [{}]}}, + } + ), + # persistentVolumeClaim set to None + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": { + "cluster_config": { + "namespace": "valid-namespace", + "persistentVolumeClaims": [{"claimName": None, "mountPath": None}], + } + }, + } + ), + # Missing volume arguments + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": {"cluster_config": {"namespace": "valid-namespace", "volumes": [{}]}}, + } + ), + # volume arguments set to None + OmegaConf.create( + { + "base_results_dir": "/some/path", + "cluster": { + "cluster_config": { + "namespace": "valid-namespace", + "volumes": [{"hostPath": None, "mountPath": None, "volumeName": None}], + } + }, + } + ), + # Do not support git clone with ssh + OmegaConf.create( + { + "base_results_dir": "/some/path", + "git": {"repo_url_or_path": "git@some_repo"}, + } + ), +] + + +@pytest.mark.parametrize("config", VALID_CONFIGS) +def test_validate_value_validator_valid_config(config): + validator = ValueValidator(config) + try: + validator.validate() + except Exception as e: + pytest.fail(f"Validator raised an exception with valid config: {str(e)}") + + +@pytest.mark.parametrize("config", INVALID_CONFIGS) +def test_validate_value_validator_invalid_config(config): + validator = ValueValidator(config) + with pytest.raises(ValueError): + validator.validate() diff --git 
a/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/Chart.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/Chart.yaml new file mode 100644 index 0000000..5665246 --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: Sagemaker Model Training +name: sagemaker-training +version: 1.0.0 diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/config/llama-8b_hydra.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/config/llama-8b_hydra.yaml new file mode 100644 index 0000000..4579e2c --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/config/llama-8b_hydra.yaml @@ -0,0 +1,85 @@ +run: + name: llama-8b + results_dir: {$results_dir}/llama-8b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 4 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 10 +exp_manager: + exp_dir: /fsx/exp/ + name: my_experiment + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 10 +use_smp_model: true +distributed_backend: smddp +model: + model_type: llama_v3 + train_batch_size: 4 + val_batch_size: 1 + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + moe: false + sequence_parallel: true + activation_checkpointing: true + activation_loading_horizon: 2 + delayed_param: true + offload_activations: false + use_smp_model_flash_attn: false + seed: 12345 + grad_clip: 1.0 + hf_pretrained_model: null + sharding_strategy: hybrid_shard + forward_prefetch: true + shard_degree: 16 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: false + max_context_width: 2048 + max_position_embeddings: 2048 + num_hidden_layers: 8 + hidden_size: 4096 + num_attention_heads: 32 + llama_intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1.0e-05 + vocab_size: 32000 + num_key_value_heads: 8 + transformer_engine: true + fp8: false + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + do_finetune: false + finetune_with_pretrained_weights: false + pretrained_model_weights: null + precision: bf16 + lr_decay_iters: 47683 + log_reduced_training_loss: true + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 0.000001 + data: + train_dir: // + val_dir: null + dataset_type: gpt + use_synthetic_data: false + zipped_data: true +cluster_type: k8s +launcher_scripts_path: {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts/ +data_config: llama-8b diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training-config.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training-config.yaml new file mode 100644 index 0000000..64e7924 --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training-config.yaml @@ -0,0 +1,8 @@ +{{ $config := .Values.trainingConfig }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: training-config-{{ $config.jobName }} +data: + config.yaml: |- + {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training.yaml 
b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training.yaml new file mode 100644 index 0000000..b40d89c --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/templates/training.yaml @@ -0,0 +1,177 @@ +{{ $config := .Values.trainingConfig }} +apiVersion: kubeflow.org/v1 +kind: PyTorchJob +metadata: + name: {{ $config.jobName }} + namespace: {{ $config.namespace }} + {{- if $config.annotations }} + annotations: + {{- range $key, $value := $config.annotations }} + {{ $key | quote }}: {{ $value | quote }} + {{- end }} + {{- end }} + labels: + app: {{ $config.jobName }} + {{- if $config.customLabels }} + {{- range $key, $value := $config.customLabels }} + {{ $key | quote }}: {{ $value | quote }} + {{- end}} + {{- end }} +spec: + pytorchReplicaSpecs: + Worker: + replicas: {{ $config.nodes }} + template: + {{- if $config.customLabels }} + metadata: + labels: + {{- range $key, $value := $config.customLabels }} + {{ $key | quote }}: {{ $value | quote }} + {{- end }} + {{- end }} + spec: + {{- if $config.priorityClassName }} + priorityClassName: {{ $config.priorityClassName }} + {{- end}} + {{- if $config.serviceAccountName }} + serviceAccountName: {{ $config.serviceAccountName }} + {{- end }} + containers: + - name: pytorch + image: {{ .Values.image.trainingImage }} + env: + {{- range $key, $value := $config.envVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end}} + command: + - /etc/config/train-script.sh + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + {{- if or (eq $config.device "gpu") (eq $config.device "trainium") (gt (int $config.numEFADevices) 0 ) }} + resources: + requests: + {{- if eq $config.device "gpu" }} + nvidia.com/gpu: {{ $config.ntasksPerNode }} + {{- end }} + {{- if eq $config.device "trainium" }} + aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }} + {{- end }} + {{- if gt (int $config.numEFADevices) 0 }} + vpc.amazonaws.com/efa: {{ $config.numEFADevices }} + {{- end }} + limits: + {{- if eq $config.device "gpu" }} + nvidia.com/gpu: {{ $config.ntasksPerNode }} + {{- end }} + {{- if eq $config.device "trainium" }} + aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }} + {{- end }} + {{- if gt (int $config.numEFADevices) 0 }} + vpc.amazonaws.com/efa: {{ $config.numEFADevices }} + {{- end }} + {{- end }} + volumeMounts: + {{- if $config.persistentVolumeClaims }} + {{- range $config.persistentVolumeClaims }} + - mountPath: {{ .mountPath }} + name: {{ .claimName }}-volume + {{- end }} + {{- end }} + {{- if $config.volumes }} + {{- range $config.volumes }} + - name: {{ .volumeName }} + mountPath: {{ .mountPath }} + {{- end }} + {{- end }} + {{- if not $config.customScript }} + - mountPath: /config + name: training-config + {{- end }} + - mountPath: /etc/config + name: train-script + - mountPath: /dev/shm + name: shm + - mountPath: /var/log/aws/clusters + name: aws-clusters-logs + readOnly: true + restartPolicy: {{ $config.restartPolicy }} + + {{- if (or $config.labelSelector.required $config.labelSelector.preferred) }} + affinity: + nodeAffinity: + {{- if $config.labelSelector.required }} + {{- range $key, $values := $config.labelSelector.required }} + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ $key | quote }} + operator: In + values: + {{- range $values }} + - {{ . 
| quote }} + {{- end}} + {{- end }} + {{- end }} + + {{- if $config.labelSelector.preferred }} + {{- $index := 0 }} + {{- range $key, $values := $config.labelSelector.preferred }} + preferredDuringSchedulingIgnoredDuringExecution: + - weight: {{ index $config.labelSelector.weights $index }} + preference: + matchExpressions: + - key: {{ $key | quote }} + operator: In + values: + {{- range $values }} + - {{ . | quote }} + {{- end }} + {{- $index = add $index 1 }} + {{- end }} + {{- end }} + {{- end }} + + volumes: + {{- if $config.persistentVolumeClaims }} + {{- range $config.persistentVolumeClaims }} + - name: {{ .claimName }}-volume + persistentVolumeClaim: + claimName: {{ .claimName }} + {{- end }} + {{- end }} + {{- if $config.volumes }} + {{- range $config.volumes }} + - name: {{ .volumeName }} + hostPath: + path: {{ .hostPath }} + type: Directory + {{- end }} + {{- end }} + {{- if not $config.customScript }} + - configMap: + name: training-config-{{ $config.jobName }} + name: training-config + {{- end }} + - name: shm + hostPath: + path: /dev/shm + type: Directory + - name: aws-clusters-logs + hostPath: + path: /var/log/aws/clusters + type: DirectoryOrCreate + - name: train-script + configMap: + defaultMode: 420 + items: + - key: train-script.sh + mode: 365 + path: train-script.sh + {{- if eq $config.device "trainium" }} + name: train-script-trn-{{ $config.jobName }} + {{- else }} + name: train-script-gpu-{{ $config.jobName }} + {{- end }} diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/values.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/values.yaml new file mode 100644 index 0000000..214d812 --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/k8s_template/values.yaml @@ -0,0 +1,46 @@ +image: + trainingImage: test_container + pullPolicy: Always +trainingConfig: + jobName: llama-8b + namespace: default + scriptPath: examples/llama/llama_pretrain.py + scriptArgs: --config-path=/config --config-name=config.yaml + customScript: null + annotations: null + customLabels: null + priority_class_name: null + device: gpu + numEFADevices: 32 + numNeuronDevices: null + ntasksPerNode: 8 + nodes: 16 + restartPolicy: Never + wandbKey: nil + serviceAccountName: null + compile: 0 + persistentVolumeClaims: + - null + volumes: null + git: + repo_url_or_path: https://test_token@github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git + branch: test_branch + commit: test_commit + token: null + pre_script: [] + post_script: [] + labelSelector: + required: null + preferred: null + weights: null + envVars: + NCCL_DEBUG: WARN + NEMO_LAUNCHER_DEBUG: 1 + SLURM_NTASKS_PER_NODE: 8 + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + FI_PROVIDER: efa + NCCL_SOCKET_IFNAME: ^lo,docker0 + NCCL_IGNORE_DISABLED_P2P: '1' + TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' + TORCH_DIST_INIT_BARRIER: '1' + CUDA_DEVICE_MAX_CONNECTIONS: '1' diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_hydra.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_hydra.yaml new file mode 100644 index 0000000..78be454 --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_hydra.yaml @@ -0,0 +1,102 @@ +run: + name: llama-8b + results_dir: {$results_dir}/llama-8b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 +exp_manager: + exp_dir: null + name: experiment + 
create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 0 + every_n_train_steps: 10 + monitor: step + mode: max + save_last: true + checkpoint_dir: None/checkpoints/ + resume_from_checkpoint: null + auto_checkpoint: + enabled: false + export_full_model: + every_n_train_steps: 0 + save_last: true +use_smp_model: true +distributed_backend: nccl +model: + model_type: llama_v3 + train_batch_size: 4 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: true + tensor_model_parallel_degree: 4 + expert_model_parallel_degree: 1 + context_parallel_degree: 2 + moe: false + activation_checkpointing: false + activation_loading_horizon: 1 + delayed_param: true + offload_activations: false + sharding_strategy: hybrid_shard + forward_prefetch: true + shard_degree: 16 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: true + fp8: true + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + max_context_width: 16384 + max_position_embeddings: 16384 + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1.0e-05 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: true + rope_theta: 500000.0 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + do_finetune: false + hf_model_name_or_path: null + peft: + peft_type: null + precision: bf16 + lr_decay_iters: 50 + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 1.0e-06 + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: false + viztracer: + enabled: false diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_submission.sh b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_submission.sh new file mode 100644 index 0000000..aef94a1 --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/llama-8b/llama-8b_submission.sh @@ -0,0 +1,2 @@ +#!/bin/bash +helm install --timeout=15m --wait --namespace default llama-8b {$results_dir}/llama-8b/k8s_template diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/Chart.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/Chart.yaml new file mode 100644 index 0000000..5665246 --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: Sagemaker Model Training +name: sagemaker-training +version: 1.0.0 diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/templates/training.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/templates/training.yaml new file mode 100644 index 0000000..b40d89c --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/templates/training.yaml @@ -0,0 +1,177 @@ +{{ $config := .Values.trainingConfig }} +apiVersion: kubeflow.org/v1 +kind: PyTorchJob +metadata: + name: {{ $config.jobName }} + namespace: {{ $config.namespace }} + {{- if $config.annotations }} + annotations: + {{- range $key, $value := $config.annotations }} + {{ $key | quote }}: {{ $value | quote }} + {{- end }} + {{- end }} + labels: + app: {{ $config.jobName }} + {{- if $config.customLabels }} 
+ {{- range $key, $value := $config.customLabels }} + {{ $key | quote }}: {{ $value | quote }} + {{- end}} + {{- end }} +spec: + pytorchReplicaSpecs: + Worker: + replicas: {{ $config.nodes }} + template: + {{- if $config.customLabels }} + metadata: + labels: + {{- range $key, $value := $config.customLabels }} + {{ $key | quote }}: {{ $value | quote }} + {{- end }} + {{- end }} + spec: + {{- if $config.priorityClassName }} + priorityClassName: {{ $config.priorityClassName }} + {{- end}} + {{- if $config.serviceAccountName }} + serviceAccountName: {{ $config.serviceAccountName }} + {{- end }} + containers: + - name: pytorch + image: {{ .Values.image.trainingImage }} + env: + {{- range $key, $value := $config.envVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end}} + command: + - /etc/config/train-script.sh + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + {{- if or (eq $config.device "gpu") (eq $config.device "trainium") (gt (int $config.numEFADevices) 0 ) }} + resources: + requests: + {{- if eq $config.device "gpu" }} + nvidia.com/gpu: {{ $config.ntasksPerNode }} + {{- end }} + {{- if eq $config.device "trainium" }} + aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }} + {{- end }} + {{- if gt (int $config.numEFADevices) 0 }} + vpc.amazonaws.com/efa: {{ $config.numEFADevices }} + {{- end }} + limits: + {{- if eq $config.device "gpu" }} + nvidia.com/gpu: {{ $config.ntasksPerNode }} + {{- end }} + {{- if eq $config.device "trainium" }} + aws.amazon.com/neurondevice: {{ $config.numNeuronDevices }} + {{- end }} + {{- if gt (int $config.numEFADevices) 0 }} + vpc.amazonaws.com/efa: {{ $config.numEFADevices }} + {{- end }} + {{- end }} + volumeMounts: + {{- if $config.persistentVolumeClaims }} + {{- range $config.persistentVolumeClaims }} + - mountPath: {{ .mountPath }} + name: {{ .claimName }}-volume + {{- end }} + {{- end }} + {{- if $config.volumes }} + {{- range $config.volumes }} + - name: {{ .volumeName }} + mountPath: {{ .mountPath }} + {{- end }} + {{- end }} + {{- if not $config.customScript }} + - mountPath: /config + name: training-config + {{- end }} + - mountPath: /etc/config + name: train-script + - mountPath: /dev/shm + name: shm + - mountPath: /var/log/aws/clusters + name: aws-clusters-logs + readOnly: true + restartPolicy: {{ $config.restartPolicy }} + + {{- if (or $config.labelSelector.required $config.labelSelector.preferred) }} + affinity: + nodeAffinity: + {{- if $config.labelSelector.required }} + {{- range $key, $values := $config.labelSelector.required }} + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ $key | quote }} + operator: In + values: + {{- range $values }} + - {{ . | quote }} + {{- end}} + {{- end }} + {{- end }} + + {{- if $config.labelSelector.preferred }} + {{- $index := 0 }} + {{- range $key, $values := $config.labelSelector.preferred }} + preferredDuringSchedulingIgnoredDuringExecution: + - weight: {{ index $config.labelSelector.weights $index }} + preference: + matchExpressions: + - key: {{ $key | quote }} + operator: In + values: + {{- range $values }} + - {{ . 
| quote }} + {{- end }} + {{- $index = add $index 1 }} + {{- end }} + {{- end }} + {{- end }} + + volumes: + {{- if $config.persistentVolumeClaims }} + {{- range $config.persistentVolumeClaims }} + - name: {{ .claimName }}-volume + persistentVolumeClaim: + claimName: {{ .claimName }} + {{- end }} + {{- end }} + {{- if $config.volumes }} + {{- range $config.volumes }} + - name: {{ .volumeName }} + hostPath: + path: {{ .hostPath }} + type: Directory + {{- end }} + {{- end }} + {{- if not $config.customScript }} + - configMap: + name: training-config-{{ $config.jobName }} + name: training-config + {{- end }} + - name: shm + hostPath: + path: /dev/shm + type: Directory + - name: aws-clusters-logs + hostPath: + path: /var/log/aws/clusters + type: DirectoryOrCreate + - name: train-script + configMap: + defaultMode: 420 + items: + - key: train-script.sh + mode: 365 + path: train-script.sh + {{- if eq $config.device "trainium" }} + name: train-script-trn-{{ $config.jobName }} + {{- else }} + name: train-script-gpu-{{ $config.jobName }} + {{- end }} diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/values.yaml b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/values.yaml new file mode 100644 index 0000000..136464f --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/k8s_template/values.yaml @@ -0,0 +1,44 @@ +image: + trainingImage: test_container + pullPolicy: Always +trainingConfig: + jobName: test_custom + namespace: default + scriptPath: test.py + scriptArgs: '--some_args debug --some_other_args 1 ' + customScript: true + annotations: null + customLabels: null + priority_class_name: null + device: gpu + numEFADevices: 32 + numNeuronDevices: null + ntasksPerNode: 8 + nodes: 8 + restartPolicy: Never + wandbKey: nil + serviceAccountName: null + compile: 0 + persistentVolumeClaims: null + volumes: null + git: + repo_url_or_path: https://github.com/example + branch: null + commit: null + token: null + pre_script: [] + post_script: [] + labelSelector: + required: null + preferred: null + weights: null + envVars: + NCCL_DEBUG: DEBUG + NEMO_LAUNCHER_DEBUG: 1 + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + FI_PROVIDER: efa + NCCL_SOCKET_IFNAME: ^lo,docker0 + NCCL_IGNORE_DISABLED_P2P: '1' + TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' + TORCH_DIST_INIT_BARRIER: '1' + CUDA_DEVICE_MAX_CONNECTIONS: '1' diff --git a/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/test_custom_submission.sh b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/test_custom_submission.sh new file mode 100644 index 0000000..8e2258d --- /dev/null +++ b/tests/k8s_workflow/k8s_baseline_artifacts/test_custom/test_custom_submission.sh @@ -0,0 +1,2 @@ +#!/bin/bash +helm install --timeout=15m --wait --namespace default test-custom {$results_dir}/test_custom/k8s_template diff --git a/tests/k8s_workflow/test_custom_k8s_workflow.py b/tests/k8s_workflow/test_custom_k8s_workflow.py new file mode 100644 index 0000000..5db3f02 --- /dev/null +++ b/tests/k8s_workflow/test_custom_k8s_workflow.py @@ -0,0 +1,50 @@ +import logging + +from omegaconf import OmegaConf + +from main import main + +logger = logging.getLogger(__name__) + +from tests.test_utils import ( + compare_artifacts, + create_temp_directory, + make_hydra_cfg_instance, +) + + +def compare_custom_k8s_artifacts(artifacts_dir): + logger.info("Comparing custom k8s artifacts") + + artifacts_paths = [ + "/test_custom/test_custom_submission.sh", + "/test_custom/k8s_template/Chart.yaml", + "/test_custom/k8s_template/values.yaml", 
+ "/test_custom/k8s_template/templates/training.yaml", + ] + + k8s_baseline_artifacts_path = "/tests/k8s_workflow/k8s_baseline_artifacts" + compare_artifacts(artifacts_paths, artifacts_dir, k8s_baseline_artifacts_path) + + +def test_custom_k8s_workflow(): + logger.info("Testing k8s workflow") + + artifacts_dir = create_temp_directory() + overrides = [ + "training_cfg.entry_script=test.py", + "cluster.instance_type=p5.48xlarge", + "base_results_dir={}".format(artifacts_dir), + "container=test_container", + "git.repo_url_or_path=https://github.com/example", + "+env_vars.NEMO_LAUNCHER_DEBUG=1", + ] + + sample_custom_k8s_config = make_hydra_cfg_instance("../launcher_scripts/custom_script", "config_k8s", overrides) + + logger.info("\nsample_custom_k8s_config\n") + logger.info(OmegaConf.to_yaml(sample_custom_k8s_config)) + + main(sample_custom_k8s_config) + + compare_custom_k8s_artifacts(artifacts_dir) diff --git a/tests/k8s_workflow/test_recipe_k8s_workflow.py b/tests/k8s_workflow/test_recipe_k8s_workflow.py new file mode 100644 index 0000000..2609f5a --- /dev/null +++ b/tests/k8s_workflow/test_recipe_k8s_workflow.py @@ -0,0 +1,82 @@ +import logging + +from omegaconf import OmegaConf + +from main import main + +logger = logging.getLogger(__name__) + +import pytest + +from tests.test_utils import ( + compare_artifacts, + create_temp_directory, + make_hydra_cfg_instance, +) + + +def compare_recipe_k8s_artifacts(artifacts_dir): + logger.info("Comparing recipe k8s artifacts") + + artifacts_paths = [ + "/llama-8b/llama-8b_submission.sh", + # "/llama-8b/llama-8b_hydra.yaml", # Do not test recipe, this changes often + "/llama-8b/k8s_template/values.yaml", + "/llama-8b/k8s_template/Chart.yaml", + # "/llama-8b/k8s_template/config/llama-8b_hydra.yaml", # Do not test recipe, this changes often + "/llama-8b/k8s_template/templates/training.yaml", + "/llama-8b/k8s_template/templates/training-config.yaml", + ] + + k8s_baseline_artifacts_path = "/tests/k8s_workflow/k8s_baseline_artifacts" + compare_artifacts(artifacts_paths, artifacts_dir, k8s_baseline_artifacts_path) + + +def test_recipe_k8s_workflow(): + logger.info("Testing recipe k8s workflow") + + artifacts_dir = create_temp_directory() + overrides = [ + "instance_type=p5.48xlarge", + "base_results_dir={}".format(artifacts_dir), + "container=test_container", + "cluster=k8s", + "cluster_type=k8s", + "+env_vars.NEMO_LAUNCHER_DEBUG=1", + "git.repo_url_or_path=https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git", + "git.branch=test_branch", + "git.commit=test_commit", + "git.token=test_token", + ] + + sample_recipe_k8s_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) + + logger.info("\nsample_recipe_k8s_config\n") + logger.info(OmegaConf.to_yaml(sample_recipe_k8s_config)) + + main(sample_recipe_k8s_config) + + compare_recipe_k8s_artifacts(artifacts_dir) + + +def test_recipe_k8s_workflow_invalid(): + logger.info("Testing recipe k8s workflow with invalid git config") + + artifacts_dir = create_temp_directory() + overrides = [ + "instance_type=p5.48xlarge", + "base_results_dir={}".format(artifacts_dir), + "container=test_container", + "cluster=k8s", + "cluster_type=k8s", + "+env_vars.NEMO_LAUNCHER_DEBUG=1", + "git.repo_url_or_path=/local/path", + ] + + sample_recipe_k8s_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) + + logger.info("\nsample_recipe_k8s_config\n") + logger.info(OmegaConf.to_yaml(sample_recipe_k8s_config)) + + with pytest.raises(ValueError): + 
main(sample_recipe_k8s_config) diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/launch_docker_container.sh b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/launch_docker_container.sh new file mode 100644 index 0000000..92b8486 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/launch_docker_container.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -ex +echo "image is test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" +# Login ECR +aws ecr get-login-password --region test_region | docker login --username AWS --password-stdin test_account.dkr.ecr.test_region.amazonaws.com + +# Getting EFA devices +device=("--device=/dev/gdrdrv") +while IFS= read -r -d '' d; do + device+=("--device=${d}") +done < <(find "/dev/infiniband" -name "uverbs*" -print0) + +# Clean old containers +docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true +docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true + +docker pull "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" +docker run --gpus 32 \ + --privileged --rm -d --name "sm_training_launcher" \ + --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ + --security-opt seccomp=unconfined \ + "${device[@]}" \ + -v {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ + -v {$results_dir}:{$results_dir} \ + test_docker_cmd \ + "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" sleep infinity + +# Running post launching commands +docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" +docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" +docker exec -itd "sm_training_launcher" bash -c "service ssh start" +docker exec "sm_training_launcher" bash -c "test_post_launch_cmd" + +exit 0 \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/llama-8b_hydra.yaml b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/llama-8b_hydra.yaml new file mode 100644 index 0000000..ff38f37 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/llama-8b_hydra.yaml @@ -0,0 +1,102 @@ +run: + name: llama-8b + results_dir: {$results_dir}/llama-8b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 4 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 0 + every_n_train_steps: 10 + monitor: step + mode: max + save_last: true + checkpoint_dir: None/checkpoints/ + resume_from_checkpoint: null + auto_checkpoint: + enabled: false + export_full_model: + every_n_train_steps: 0 + save_last: true +use_smp_model: true +distributed_backend: nccl +model: + model_type: llama_v3 + train_batch_size: 2 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: true + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: false + activation_checkpointing: true + activation_loading_horizon: 2 + delayed_param: true + offload_activations: false + sharding_strategy: hybrid_shard + forward_prefetch: true + 
shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: false + fp8: true + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + max_context_width: 8192 + max_position_embeddings: 8192 + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1.0e-05 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: true + rope_theta: 500000.0 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + do_finetune: false + hf_model_name_or_path: null + peft: + peft_type: null + precision: bf16 + lr_decay_iters: 50 + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 1.0e-06 + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: false + viztracer: + enabled: false diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/sagemaker-hf-llama3-8b_submission.sh b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/sagemaker-hf-llama3-8b_submission.sh new file mode 100644 index 0000000..234234d --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/sagemaker-hf-llama3-8b_submission.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Parameters +#SBATCH --exclusive +#SBATCH --job-name=sagemaker-hf-llama3-8b +#SBATCH --mem=0 +#SBATCH --nodes=4 +#SBATCH --output={$results_dir}/hf-llama3-8b/log-sagemaker-hf-llama3-8b_%j.out +#SBATCH --time=6-00:00:00 + +# setup +export NCCL_DEBUG=WARN +export FI_PROVIDER=efa +export NCCL_SOCKET_IFNAME=^lo,docker0 +export NCCL_IGNORE_DISABLED_P2P=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_DIST_INIT_BARRIER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + + +# Prepare distributed files +srun -l bash -c "scontrol show hostnames | sort > {$results_dir}/hf-llama3-8b/hostname" + +srun -l bash {$results_dir}/hf-llama3-8b/launch_docker_container.sh +srun -l bash {$results_dir}/hf-llama3-8b/docker_exec_script.sh \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/train_script.sh b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/train_script.sh new file mode 100644 index 0000000..27108e4 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/hf-llama3-8b/train_script.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -ex +export NCCL_DEBUG=WARN +export FI_PROVIDER=efa +export NCCL_SOCKET_IFNAME=^lo,docker0 +export NCCL_IGNORE_DISABLED_P2P=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_DIST_INIT_BARRIER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +MASTER_ADDR=$(head -n 1 {$results_dir}/llama-8b/hostname) +NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/llama-8b/hostname | cut -d ":" -f 1) - 1)) +NNODES=4 +PROCESSES_PER_NODE=8 +MASTER_PORT=41000 + +DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" + +# For greater env stability, grab hostname from `hostname` +# https://sim.amazon.com/issues/P162624109 +LAUNCHER_HOSTNAME="$(hostname)" + +mkdir -p $HOME/tmp +GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" +[[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR +git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR +GIT_CLONE_DIR=${GIT_CLONE_DIR}/ +cd 
$GIT_CLONE_DIR + +unset SLURM_NTASKS + +torchrun $DISTRIBUTED_ARGS \ + examples/llama/llama_pretrain.py \ + --config-path={$results_dir}/llama-8b --config-name=llama-8b_hydra.yaml \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/docker_exec_script.sh b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/docker_exec_script.sh new file mode 100644 index 0000000..28d8be4 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/docker_exec_script.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -ex + +function job_epilogue { + docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true +} +trap job_epilogue EXIT SIGTERM SIGINT + +docker exec sm_training_launcher bash {$results_dir}/llama-8b/train_script.sh + +exit 0 \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/launch_docker_container.sh b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/launch_docker_container.sh new file mode 100644 index 0000000..6db8ebb --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/launch_docker_container.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -ex +echo "image is test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" +# Login ECR +aws ecr get-login-password --region test_region | docker login --username AWS --password-stdin test_account.dkr.ecr.test_region.amazonaws.com + +# Getting EFA devices +device=("--device=/dev/gdrdrv") +while IFS= read -r -d '' d; do + device+=("--device=${d}") +done < <(find "/dev/infiniband" -name "uverbs*" -print0) + +# Clean old containers +docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true +docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true + +docker pull "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" +docker run --gpus 8 \ + --privileged --rm -d --name "sm_training_launcher" \ + --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ + --security-opt seccomp=unconfined \ + "${device[@]}" \ + -v {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ + -v {$results_dir}:{$results_dir} \ + test_docker_cmd \ + "test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag" sleep infinity + +# Running post launching commands +docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" +docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" +docker exec -itd "sm_training_launcher" bash -c "service ssh start" +docker exec "sm_training_launcher" bash -c "test_post_launch_cmd" + +exit 0 \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/llama-8b_hydra.yaml b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/llama-8b_hydra.yaml new file mode 100644 index 0000000..ff38f37 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/llama-8b_hydra.yaml @@ -0,0 +1,102 @@ +run: + name: llama-8b + results_dir: {$results_dir}/llama-8b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 4 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 +exp_manager: + exp_dir: null + name: experiment + 
create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 0 + every_n_train_steps: 10 + monitor: step + mode: max + save_last: true + checkpoint_dir: None/checkpoints/ + resume_from_checkpoint: null + auto_checkpoint: + enabled: false + export_full_model: + every_n_train_steps: 0 + save_last: true +use_smp_model: true +distributed_backend: nccl +model: + model_type: llama_v3 + train_batch_size: 2 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: true + tensor_model_parallel_degree: 1 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: false + activation_checkpointing: true + activation_loading_horizon: 2 + delayed_param: true + offload_activations: false + sharding_strategy: hybrid_shard + forward_prefetch: true + shard_degree: 8 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: false + fp8: true + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + max_context_width: 8192 + max_position_embeddings: 8192 + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1.0e-05 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: true + rope_theta: 500000.0 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + do_finetune: false + hf_model_name_or_path: null + peft: + peft_type: null + precision: bf16 + lr_decay_iters: 50 + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 1.0e-06 + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: false + viztracer: + enabled: false diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/sagemaker-llama-8b_submission.sh b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/sagemaker-llama-8b_submission.sh new file mode 100644 index 0000000..2d9341b --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/sagemaker-llama-8b_submission.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# Parameters +#SBATCH --exclusive +#SBATCH --job-name=sagemaker-llama-8b +#SBATCH --mem=0 +#SBATCH --nodes=16 +#SBATCH --output={$results_dir}/llama-8b/log-sagemaker-llama-8b_%j.out +#SBATCH --time=6-00:00:00 + +# setup +export NCCL_DEBUG=WARN +export FI_PROVIDER=efa +export NCCL_SOCKET_IFNAME=^lo,docker0 +export NCCL_IGNORE_DISABLED_P2P=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_DIST_INIT_BARRIER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + + +# Prepare distributed files +srun -l bash -c "scontrol show hostnames | sort > {$results_dir}/llama-8b/hostname" + +srun -l bash {$results_dir}/llama-8b/launch_docker_container.sh +srun -l bash {$results_dir}/llama-8b/docker_exec_script.sh \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/train_script.sh b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/train_script.sh new file mode 100644 index 0000000..a2a3abd --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/llama-8b/train_script.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -ex +export NCCL_DEBUG=WARN +export FI_PROVIDER=efa +export NCCL_SOCKET_IFNAME=^lo,docker0 +export NCCL_IGNORE_DISABLED_P2P=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_DIST_INIT_BARRIER=1 +export 
CUDA_DEVICE_MAX_CONNECTIONS=1 +MASTER_ADDR=$(head -n 1 {$results_dir}/llama-8b/hostname) +NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/llama-8b/hostname | cut -d ":" -f 1) - 1)) +NNODES=16 +PROCESSES_PER_NODE=8 +MASTER_PORT=41000 + +DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" + +# For greater env stability, grab hostname from `hostname` +# https://sim.amazon.com/issues/P162624109 +LAUNCHER_HOSTNAME="$(hostname)" + +mkdir -p $HOME/tmp +GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" +[[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR +git clone https://github.com/aws/sagemaker-hyperpod-training-adapter-for-nemo.git $GIT_CLONE_DIR +GIT_CLONE_DIR=${GIT_CLONE_DIR}/ +cd $GIT_CLONE_DIR + +unset SLURM_NTASKS + +torchrun $DISTRIBUTED_ARGS \ + examples/llama/llama_pretrain.py \ + --config-path={$results_dir}/llama-8b --config-name=llama-8b_hydra.yaml \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/docker_exec_script.sh b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/docker_exec_script.sh new file mode 100644 index 0000000..10c61b3 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/docker_exec_script.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -ex + +function job_epilogue { + docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true +} +trap job_epilogue EXIT SIGTERM SIGINT + +docker exec sm_training_launcher bash {$results_dir}/test_custom/train_script.sh + +exit 0 \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/launch_docker_container.sh b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/launch_docker_container.sh new file mode 100644 index 0000000..c1694ac --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/launch_docker_container.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -ex +echo "image is test_container" +echo "Not an ECR image, skipping ECR login" +# Getting EFA devices +device=("--device=/dev/gdrdrv") +while IFS= read -r -d '' d; do + device+=("--device=${d}") +done < <(find "/dev/infiniband" -name "uverbs*" -print0) + +# Clean old containers +docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker rm -f {} > /dev/null 2>&1 || true +docker ps -a --filter 'name=sm_training_launcher' --format '{{.ID}}' | xargs -I{} docker wait {} || true + +docker pull "test_container" +docker run --gpus 8 \ + --privileged --rm -d --name "sm_training_launcher" \ + --uts=host --ulimit stack=67108864 --ulimit memlock=-1 --ipc=host --net=host \ + --security-opt seccomp=unconfined \ + "${device[@]}" \ + -v {$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts:{$workspace_dir}/launcher/nemo/nemo_framework_launcher/launcher_scripts \ + -v {$results_dir}:{$results_dir} \ + "test_container" sleep infinity + +# Running post launching commands +docker exec -itd "sm_training_launcher" bash -c "printf \"Port 2022\n\" >> /etc/ssh/sshd_config" +docker exec -itd "sm_training_launcher" bash -c "printf \" Port 2022\n\" >> /root/.ssh/config" +docker exec -itd "sm_training_launcher" bash -c "service ssh start" + +exit 0 \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/testcustom_slurm_test_custom_submission.sh 
b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/testcustom_slurm_test_custom_submission.sh new file mode 100644 index 0000000..ae6f970 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/testcustom_slurm_test_custom_submission.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Parameters +#SBATCH --error={$results_dir}/test_custom/log-testcustom_slurm_test_custom_%j.err +#SBATCH --exclusive +#SBATCH --job-name=testcustom_slurm_test_custom +#SBATCH --nodes=2 +#SBATCH --output={$results_dir}/test_custom/log-testcustom_slurm_test_custom_%j.out + +# setup +export NCCL_DEBUG=DEBUG +export FI_PROVIDER=efa +export NCCL_SOCKET_IFNAME=^lo,docker0 +export NCCL_IGNORE_DISABLED_P2P=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_DIST_INIT_BARRIER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + + +# Prepare distributed files +srun -l bash -c "scontrol show hostnames | sort > {$results_dir}/test_custom/hostname" + +srun -l bash {$results_dir}/test_custom/launch_docker_container.sh +srun -l bash {$results_dir}/test_custom/docker_exec_script.sh \ No newline at end of file diff --git a/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/train_script.sh b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/train_script.sh new file mode 100644 index 0000000..a2aaf52 --- /dev/null +++ b/tests/slurm_workflow/slurm_baseline_artifacts/test_custom/train_script.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -ex +export NCCL_DEBUG=DEBUG +export FI_PROVIDER=efa +export NCCL_SOCKET_IFNAME=^lo,docker0 +export NCCL_IGNORE_DISABLED_P2P=1 +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export TORCH_DIST_INIT_BARRIER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +MASTER_ADDR=$(head -n 1 {$results_dir}/test_custom/hostname) +NODEID=$(($(grep -nx -o "\b$(hostname)\b" {$results_dir}/test_custom/hostname | cut -d ":" -f 1) - 1)) +NNODES=2 +PROCESSES_PER_NODE=8 +MASTER_PORT=41000 + +DISTRIBUTED_ARGS="--nproc_per_node $PROCESSES_PER_NODE --nnodes $NNODES --rdzv_endpoint=$MASTER_ADDR --rdzv_id=100 --rdzv_backend=c10d" + +# For greater env stability, grab hostname from `hostname` +# https://sim.amazon.com/issues/P162624109 +LAUNCHER_HOSTNAME="$(hostname)" + +mkdir -p $HOME/tmp +GIT_CLONE_DIR="$HOME/tmp/$LAUNCHER_HOSTNAME" +[[ -d $GIT_CLONE_DIR ]] && rm -rf $GIT_CLONE_DIR +git clone https://github.com/example $GIT_CLONE_DIR +GIT_CLONE_DIR=${GIT_CLONE_DIR}/ +cd $GIT_CLONE_DIR + +unset SLURM_NTASKS + +torchrun $DISTRIBUTED_ARGS \ + test.py \ + \ No newline at end of file diff --git a/tests/slurm_workflow/test_custom_slurm_workflow.py b/tests/slurm_workflow/test_custom_slurm_workflow.py new file mode 100644 index 0000000..344fe18 --- /dev/null +++ b/tests/slurm_workflow/test_custom_slurm_workflow.py @@ -0,0 +1,64 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+ +import logging + +from omegaconf import OmegaConf + +from main import main + +logger = logging.getLogger(__name__) + + +from tests.test_utils import ( + compare_artifacts, + create_temp_directory, + make_hydra_cfg_instance, +) + + +def compare_custom_slurm_artifacts(artifacts_dir): + logger.info("Comparing custom slurm artifacts") + + artifacts_paths = [ + "/test_custom/launch_docker_container.sh", + "/test_custom/testcustom_slurm_test_custom_submission.sh", + "/test_custom/train_script.sh", + "/test_custom/docker_exec_script.sh", + ] + slurm_baseline_artifacts_path = "/tests/slurm_workflow/slurm_baseline_artifacts" + compare_artifacts(artifacts_paths, artifacts_dir, slurm_baseline_artifacts_path) + + +def test_custom_slurm_workflow(): + logger.info("Testing custom slurm workflow") + + artifacts_dir = create_temp_directory() + overrides = [ + "training_cfg.entry_script=test.py", + "cluster.instance_type=p5.48xlarge", + "cluster.cluster_type=slurm", + "cluster.cluster_config.slurm_create_submission_file_only=True", + "git.repo_url_or_path=https://github.com/example", + "base_results_dir={}".format(artifacts_dir), + "container=test_container", + ] + + sample_custom_slurm_config = make_hydra_cfg_instance("../launcher_scripts/custom_script", "config_slurm", overrides) + + logger.info("\nsample_custom_slurm_config\n") + logger.info(OmegaConf.to_yaml(sample_custom_slurm_config)) + + main(sample_custom_slurm_config) + + compare_custom_slurm_artifacts(artifacts_dir) diff --git a/tests/slurm_workflow/test_recipe_slurm_workflow.py b/tests/slurm_workflow/test_recipe_slurm_workflow.py new file mode 100644 index 0000000..47b1a3b --- /dev/null +++ b/tests/slurm_workflow/test_recipe_slurm_workflow.py @@ -0,0 +1,107 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+ +import logging + +from omegaconf import OmegaConf + +from main import main + +logger = logging.getLogger(__name__) + + +from tests.test_utils import ( + compare_artifacts, + create_temp_directory, + make_hydra_cfg_instance, +) + + +def compare_recipe_slurm_artifacts(artifacts_dir): + logger.info("Comparing recipe slurm artifacts") + + artifacts_paths = [ + "/llama-8b/launch_docker_container.sh", + # "/llama-8b/llama-8b_hydra.yaml", # Do not test the recipe, this is changing often + "/llama-8b/sagemaker-llama-8b_submission.sh", + "/llama-8b/train_script.sh", + "/llama-8b/docker_exec_script.sh", + ] + + slurm_baseline_artifacts_path = "/tests/slurm_workflow/slurm_baseline_artifacts" + compare_artifacts(artifacts_paths, artifacts_dir, slurm_baseline_artifacts_path) + + +def compare_recipe_slurm_artifacts_trn(artifacts_dir): + logger.info("Comparing recipe slurm artifacts") + + artifacts_paths = [ + "/hf-llama3-8b/launch_docker_container.sh", + # "/llama-8b/llama-8b_hydra.yaml", # Do not test the recipe, this is changing often + "/hf-llama3-8b/sagemaker-hf-llama3-8b_submission.sh", + # "/llama-7b/train_script.sh", + ] + + slurm_baseline_artifacts_path = "/tests/slurm_workflow/slurm_baseline_artifacts" + compare_artifacts(artifacts_paths, artifacts_dir, slurm_baseline_artifacts_path) + + +def test_recipe_slurm_workflow(): + logger.info("Testing recipe slurm workflow") + + artifacts_dir = create_temp_directory() + overrides = [ + "instance_type=ml.p5.48xlarge", + "base_results_dir={}".format(artifacts_dir), + "container=test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag", + "cluster.slurm_create_submission_file_only=True", + "cluster.slurm_docker_cfg.docker_args=[test_docker_cmd]", + "cluster.slurm_docker_cfg.post_launch_commands=[test_post_launch_cmd]", + ] + + sample_recipe_slurm_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) + + logger.info("\nsample_recipe_slurm_config\n") + logger.info(OmegaConf.to_yaml(sample_recipe_slurm_config)) + + main(sample_recipe_slurm_config) + + compare_recipe_slurm_artifacts(artifacts_dir) + + +def test_recipe_slurm_trn_workflow(): + logger.info("Testing recipe slurm workflow for trn") + + artifacts_dir = create_temp_directory() + overrides = [ + "instance_type=trn1.32xlarge", + "recipes=training/llama/hf_llama3_8b_seq8k_trn1x4_pretrain.yaml", + "base_results_dir={}".format(artifacts_dir), + "container=test_account.dkr.ecr.test_region.amazonaws.com/test_repo:test_tag", + "cluster.slurm_create_submission_file_only=True", + "cluster.slurm_docker_cfg.docker_args=[test_docker_cmd]", + "cluster.slurm_docker_cfg.post_launch_commands=[test_post_launch_cmd]", + "recipes.run.name=hf-llama3-8b", + "recipes.trainer.num_nodes=4", + "recipes.data.train_dir=/fake_dataset", + "recipes.model.model_config=/fake_dataset/config.json", + ] + + sample_recipe_slurm_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) + + logger.info("\nsample_recipe_slurm_config\n") + logger.info(OmegaConf.to_yaml(sample_recipe_slurm_config)) + + main(sample_recipe_slurm_config) + + compare_recipe_slurm_artifacts_trn(artifacts_dir) diff --git a/tests/sm_jobs_workflow/__init__.py b/tests/sm_jobs_workflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/launch.py b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/launch.py new file mode 100644 index 0000000..7d7eccf --- /dev/null +++ 
b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/launch.py @@ -0,0 +1,140 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import argparse +import logging +import os + +import omegaconf +import sagemaker +from omegaconf import OmegaConf +from sagemaker.debugger import TensorBoardOutputConfig +from sagemaker.inputs import FileSystemInput +from sagemaker.interactive_apps import SupportedInteractiveAppTypes +from sagemaker.pytorch import PyTorch + +logger = logging.getLogger(__name__) + + +def parse_args(): + script_dir = os.path.dirname(os.path.join(os.path.realpath(__file__))) + parser = argparse.ArgumentParser(description="Launch training recipe using SM jobs") + parser.add_argument( + "--recipe", type=str, default=os.path.join(script_dir, "recipe.yaml"), help="Path to recipe config." + ) + parser.add_argument( + "--sm_jobs_config", + type=str, + default=os.path.join(script_dir, "sm_jobs_config.yaml"), + help="Path to sm jobs config.", + ) + parser.add_argument("--job_name", type=str, required=True, help="Job name for the SDK job.") + parser.add_argument("--instance_type", type=str, required=True, help="Instance type to use for the training job.") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + sagemaker_session = sagemaker.Session() + role = sagemaker.get_execution_role() + + sm_jobs_config = OmegaConf.load(args.sm_jobs_config) + recipe_overrides = sm_jobs_config.get("recipe_overrides", omegaconf.DictConfig(dict())) + recipe = OmegaConf.load(args.recipe) + recipe = OmegaConf.merge(recipe, recipe_overrides) + recipe_overrides = OmegaConf.to_container(recipe_overrides) + + sm_inputs = sm_jobs_config.get("inputs") + inputs = None + if sm_inputs: + s3 = sm_inputs.get("s3") + file_system = sm_inputs.get("file_system") + if s3 and file_system: + raise ValueError("Must set only one of s3 or file_system in sm_jobs_config.inputs.") + if s3 is None and file_system is None: + raise ValueError("Must set either s3 or file_system in sm_jobs_config.inputs.") + if s3: + inputs = OmegaConf.to_container(s3) + else: + file_system_id = file_system.get("id") + file_system_type = file_system.get("type") + directory_path = file_system.get("directory_path") + if file_system_id is None or file_system_type is None or directory_path is None: + raise ValueError("Must set id, type and directory_path for file_system input type in sm_jobs_config.") + inputs = FileSystemInput( + file_system_id=file_system_id, + file_system_type=file_system_type, + directory_path=directory_path, + file_system_access_mode="ro", + ) + + output_path = sm_jobs_config.get("output_path") + if output_path is None: + raise ValueError("Expected output_path to be set with sm_jobs cluster type") + + additional_estimator_kwargs = sm_jobs_config.get("additional_estimator_kwargs", omegaconf.DictConfig(dict())) + additional_estimator_kwargs = OmegaConf.to_container(additional_estimator_kwargs) + + tensorboard_config = sm_jobs_config.get("tensorboard_config") + if 
tensorboard_config: + tb_output_path = tensorboard_config.get("output_path") + tb_container_path = tensorboard_config.get("container_logs_path") + if tb_output_path is None or tb_container_path is None: + raise ValueError("Please set output path and container path when using tensorboard.") + tensorboard_output_config = TensorBoardOutputConfig( + s3_output_path=tb_output_path, container_local_output_path=tb_container_path + ) + additional_estimator_kwargs["tensorboard_output_config"] = tensorboard_output_config + if recipe.get("exp_manager") is None or recipe.get("exp_manager", dict()).get("explicit_log_dir") is None: + logger.warning("Using tensorboard but not set exp_manager -> explicit_log_dir for recipe.") + + base_job_name = args.job_name.replace(".", "-") + base_job_name = base_job_name.replace("_", "-") + estimator = PyTorch( + base_job_name=base_job_name, + instance_type=args.instance_type, + training_recipe=args.recipe, + recipe_overrides=recipe_overrides, + output_path=output_path, + role=role, + sagemaker_session=sagemaker_session, + **additional_estimator_kwargs, + ) + + if tensorboard_config: + logger.info("Tensorboard url:") + logger.info( + estimator.get_app_url( + app_type=SupportedInteractiveAppTypes.TENSORBOARD, + open_in_default_web_browser=False, + ) + ) + + if not isinstance(inputs, FileSystemInput): + keys_to_pop = [] + for item in inputs.keys(): + if not inputs[item]: + print(f"poping input {inputs[item]}, {item}") + keys_to_pop.append(item) + for item in keys_to_pop: + inputs.pop(item) + if len(inputs) == 0: + inputs = None + + estimator.fit(inputs=inputs, wait=sm_jobs_config.get("wait", False)) + + +if __name__ == "__main__": + main() diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_hydra.yaml b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_hydra.yaml new file mode 100644 index 0000000..95e3811 --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_hydra.yaml @@ -0,0 +1,81 @@ +run: + name: llama3.2-11b + results_dir: {$results_dir}/llama3.2-11b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 4 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 0 + every_n_train_steps: 10 + monitor: step + mode: max + save_last: false + checkpoint_dir: None/checkpoints/ + resume_from_checkpoint: null + auto_checkpoint: + enabled: false + export_full_model: + every_n_train_steps: 0 + save_last: false +use_smp_model: false +distributed_backend: nccl +model: + model_type: llama_v3 + do_finetune: false + hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct + hf_access_token: null + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + use_flash_attention: true + activation_checkpointing: true + multi_modal: true + delayed_param: false + sharding_strategy: hybrid_shard + forward_prefetch: true + shard_degree: 32 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: false + max_context_width: 8192 + precision: bf16 + lr_decay_iters: 47683 + log_reduced_training_loss: true + peft: + peft_type: null + optim: + name: adamw + lr: 0.0002 + weight_decay: 0.01 + betas: + - 0.9 + - 
0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 2.0e-05 + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: false + tokenizer_name: null + zipped_data: false + viztracer: + enabled: false diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_submission.sh b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_submission.sh new file mode 100644 index 0000000..469d0ec --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/llama3.2-11b_submission.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +pushd $(dirname -- $0) +python launch.py --job_name llama3.2-11b --instance_type p5.48xlarge +popd \ No newline at end of file diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/recipe.yaml b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/recipe.yaml new file mode 100644 index 0000000..7791a09 --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/recipe.yaml @@ -0,0 +1,80 @@ +run: + name: llama3.2-11b + results_dir: /var/folders/6w/nm79zb595ll18wyj6czl6gfm0000gq/T/tmp1nal2g5n/llama3.2-11b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 4 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 100 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 0 + every_n_train_steps: 10 + monitor: step + mode: max + checkpoint_dir: None/checkpoints/ + resume_from_checkpoint: null + auto_checkpoint: + enabled: false + export_full_model: + every_n_train_steps: 0 + save_last: false +use_smp_model: false +distributed_backend: nccl +model: + model_type: llama_v3 + do_finetune: false + hf_model_name_or_path: meta-llama/Llama-3.2-11B-Vision-Instruct + hf_access_token: null + train_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + use_flash_attention: true + activation_checkpointing: true + multi_modal: true + delayed_param: false + sharding_strategy: hybrid_shard + forward_prefetch: true + shard_degree: 32 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: false + use_orig_param: false + max_context_width: 8192 + precision: bf16 + lr_decay_iters: 47683 + log_reduced_training_loss: true + peft: + peft_type: null + optim: + name: adamw + lr: 0.0002 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 2.0e-05 + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: false + tokenizer_name: null + zipped_data: false + viztracer: + enabled: false diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/requirements.txt b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/requirements.txt new file mode 100644 index 0000000..e488add --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/requirements.txt @@ -0,0 +1 @@ +transformers==4.45.2 \ No newline at end of file diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/sm_jobs_config.yaml b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/sm_jobs_config.yaml new file mode 
100644 index 0000000..fb2ab8f --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/multimodal/llama3.2-11b/sm_jobs_config.yaml @@ -0,0 +1,17 @@ +output_path: s3://test_path +tensorboard_config: + output_path: s3://test_tensorboard_path + container_logs_path: /opt/ml/output/tensorboard +wait: true +inputs: + s3: + train: null + val: null + file_system: + id: null + type: null + directory_path: null +additional_estimator_kwargs: + max_run: 1800 + enable_remote_debug: true +recipe_overrides: null diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/launch.py b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/launch.py new file mode 100644 index 0000000..7d7eccf --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/launch.py @@ -0,0 +1,140 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import argparse +import logging +import os + +import omegaconf +import sagemaker +from omegaconf import OmegaConf +from sagemaker.debugger import TensorBoardOutputConfig +from sagemaker.inputs import FileSystemInput +from sagemaker.interactive_apps import SupportedInteractiveAppTypes +from sagemaker.pytorch import PyTorch + +logger = logging.getLogger(__name__) + + +def parse_args(): + script_dir = os.path.dirname(os.path.join(os.path.realpath(__file__))) + parser = argparse.ArgumentParser(description="Launch training recipe using SM jobs") + parser.add_argument( + "--recipe", type=str, default=os.path.join(script_dir, "recipe.yaml"), help="Path to recipe config." 
+ ) + parser.add_argument( + "--sm_jobs_config", + type=str, + default=os.path.join(script_dir, "sm_jobs_config.yaml"), + help="Path to sm jobs config.", + ) + parser.add_argument("--job_name", type=str, required=True, help="Job name for the SDK job.") + parser.add_argument("--instance_type", type=str, required=True, help="Instance type to use for the training job.") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + sagemaker_session = sagemaker.Session() + role = sagemaker.get_execution_role() + + sm_jobs_config = OmegaConf.load(args.sm_jobs_config) + recipe_overrides = sm_jobs_config.get("recipe_overrides", omegaconf.DictConfig(dict())) + recipe = OmegaConf.load(args.recipe) + recipe = OmegaConf.merge(recipe, recipe_overrides) + recipe_overrides = OmegaConf.to_container(recipe_overrides) + + sm_inputs = sm_jobs_config.get("inputs") + inputs = None + if sm_inputs: + s3 = sm_inputs.get("s3") + file_system = sm_inputs.get("file_system") + if s3 and file_system: + raise ValueError("Must set only one of s3 or file_system in sm_jobs_config.inputs.") + if s3 is None and file_system is None: + raise ValueError("Must set either s3 or file_system in sm_jobs_config.inputs.") + if s3: + inputs = OmegaConf.to_container(s3) + else: + file_system_id = file_system.get("id") + file_system_type = file_system.get("type") + directory_path = file_system.get("directory_path") + if file_system_id is None or file_system_type is None or directory_path is None: + raise ValueError("Must set id, type and directory_path for file_system input type in sm_jobs_config.") + inputs = FileSystemInput( + file_system_id=file_system_id, + file_system_type=file_system_type, + directory_path=directory_path, + file_system_access_mode="ro", + ) + + output_path = sm_jobs_config.get("output_path") + if output_path is None: + raise ValueError("Expected output_path to be set with sm_jobs cluster type") + + additional_estimator_kwargs = sm_jobs_config.get("additional_estimator_kwargs", omegaconf.DictConfig(dict())) + additional_estimator_kwargs = OmegaConf.to_container(additional_estimator_kwargs) + + tensorboard_config = sm_jobs_config.get("tensorboard_config") + if tensorboard_config: + tb_output_path = tensorboard_config.get("output_path") + tb_container_path = tensorboard_config.get("container_logs_path") + if tb_output_path is None or tb_container_path is None: + raise ValueError("Please set output path and container path when using tensorboard.") + tensorboard_output_config = TensorBoardOutputConfig( + s3_output_path=tb_output_path, container_local_output_path=tb_container_path + ) + additional_estimator_kwargs["tensorboard_output_config"] = tensorboard_output_config + if recipe.get("exp_manager") is None or recipe.get("exp_manager", dict()).get("explicit_log_dir") is None: + logger.warning("Using tensorboard but not set exp_manager -> explicit_log_dir for recipe.") + + base_job_name = args.job_name.replace(".", "-") + base_job_name = base_job_name.replace("_", "-") + estimator = PyTorch( + base_job_name=base_job_name, + instance_type=args.instance_type, + training_recipe=args.recipe, + recipe_overrides=recipe_overrides, + output_path=output_path, + role=role, + sagemaker_session=sagemaker_session, + **additional_estimator_kwargs, + ) + + if tensorboard_config: + logger.info("Tensorboard url:") + logger.info( + estimator.get_app_url( + app_type=SupportedInteractiveAppTypes.TENSORBOARD, + open_in_default_web_browser=False, + ) + ) + + if not isinstance(inputs, FileSystemInput): + keys_to_pop = 
[] + for item in inputs.keys(): + if not inputs[item]: + print(f"poping input {inputs[item]}, {item}") + keys_to_pop.append(item) + for item in keys_to_pop: + inputs.pop(item) + if len(inputs) == 0: + inputs = None + + estimator.fit(inputs=inputs, wait=sm_jobs_config.get("wait", False)) + + +if __name__ == "__main__": + main() diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_hydra.yaml b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_hydra.yaml new file mode 100644 index 0000000..777a6b0 --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_hydra.yaml @@ -0,0 +1,102 @@ +run: + name: llama-8b + results_dir: {$results_dir}/llama-8b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 0 + every_n_train_steps: 10 + monitor: step + mode: max + save_last: false + checkpoint_dir: None/checkpoints/ + resume_from_checkpoint: null + auto_checkpoint: + enabled: false + export_full_model: + every_n_train_steps: 0 + save_last: true +use_smp_model: true +distributed_backend: nccl +model: + model_type: llama_v3 + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: true + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: false + activation_checkpointing: false + activation_loading_horizon: 1 + delayed_param: true + offload_activations: false + sharding_strategy: hybrid_shard + forward_prefetch: true + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: true + fp8: true + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + max_context_width: 16384 + max_position_embeddings: 16384 + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1.0e-05 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: true + rope_theta: 500000.0 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + do_finetune: false + hf_model_name_or_path: null + peft: + peft_type: null + precision: bf16 + lr_decay_iters: 50 + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 1.0e-06 + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: false + viztracer: + enabled: false diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_submission.sh b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_submission.sh new file mode 100644 index 0000000..062fc12 --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/llama-8b_submission.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +pushd $(dirname -- $0) +python launch.py --job_name llama-8b --instance_type p5.48xlarge +popd \ No newline at end of file diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/sm_jobs_config.yaml 
b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/sm_jobs_config.yaml new file mode 100644 index 0000000..fb2ab8f --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/no_kwargs/llama-8b/sm_jobs_config.yaml @@ -0,0 +1,17 @@ +output_path: s3://test_path +tensorboard_config: + output_path: s3://test_tensorboard_path + container_logs_path: /opt/ml/output/tensorboard +wait: true +inputs: + s3: + train: null + val: null + file_system: + id: null + type: null + directory_path: null +additional_estimator_kwargs: + max_run: 1800 + enable_remote_debug: true +recipe_overrides: null diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/launch.py b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/launch.py new file mode 100644 index 0000000..7d7eccf --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/launch.py @@ -0,0 +1,140 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +import argparse +import logging +import os + +import omegaconf +import sagemaker +from omegaconf import OmegaConf +from sagemaker.debugger import TensorBoardOutputConfig +from sagemaker.inputs import FileSystemInput +from sagemaker.interactive_apps import SupportedInteractiveAppTypes +from sagemaker.pytorch import PyTorch + +logger = logging.getLogger(__name__) + + +def parse_args(): + script_dir = os.path.dirname(os.path.join(os.path.realpath(__file__))) + parser = argparse.ArgumentParser(description="Launch training recipe using SM jobs") + parser.add_argument( + "--recipe", type=str, default=os.path.join(script_dir, "recipe.yaml"), help="Path to recipe config." 
+ ) + parser.add_argument( + "--sm_jobs_config", + type=str, + default=os.path.join(script_dir, "sm_jobs_config.yaml"), + help="Path to sm jobs config.", + ) + parser.add_argument("--job_name", type=str, required=True, help="Job name for the SDK job.") + parser.add_argument("--instance_type", type=str, required=True, help="Instance type to use for the training job.") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + sagemaker_session = sagemaker.Session() + role = sagemaker.get_execution_role() + + sm_jobs_config = OmegaConf.load(args.sm_jobs_config) + recipe_overrides = sm_jobs_config.get("recipe_overrides", omegaconf.DictConfig(dict())) + recipe = OmegaConf.load(args.recipe) + recipe = OmegaConf.merge(recipe, recipe_overrides) + recipe_overrides = OmegaConf.to_container(recipe_overrides) + + sm_inputs = sm_jobs_config.get("inputs") + inputs = None + if sm_inputs: + s3 = sm_inputs.get("s3") + file_system = sm_inputs.get("file_system") + if s3 and file_system: + raise ValueError("Must set only one of s3 or file_system in sm_jobs_config.inputs.") + if s3 is None and file_system is None: + raise ValueError("Must set either s3 or file_system in sm_jobs_config.inputs.") + if s3: + inputs = OmegaConf.to_container(s3) + else: + file_system_id = file_system.get("id") + file_system_type = file_system.get("type") + directory_path = file_system.get("directory_path") + if file_system_id is None or file_system_type is None or directory_path is None: + raise ValueError("Must set id, type and directory_path for file_system input type in sm_jobs_config.") + inputs = FileSystemInput( + file_system_id=file_system_id, + file_system_type=file_system_type, + directory_path=directory_path, + file_system_access_mode="ro", + ) + + output_path = sm_jobs_config.get("output_path") + if output_path is None: + raise ValueError("Expected output_path to be set with sm_jobs cluster type") + + additional_estimator_kwargs = sm_jobs_config.get("additional_estimator_kwargs", omegaconf.DictConfig(dict())) + additional_estimator_kwargs = OmegaConf.to_container(additional_estimator_kwargs) + + tensorboard_config = sm_jobs_config.get("tensorboard_config") + if tensorboard_config: + tb_output_path = tensorboard_config.get("output_path") + tb_container_path = tensorboard_config.get("container_logs_path") + if tb_output_path is None or tb_container_path is None: + raise ValueError("Please set output path and container path when using tensorboard.") + tensorboard_output_config = TensorBoardOutputConfig( + s3_output_path=tb_output_path, container_local_output_path=tb_container_path + ) + additional_estimator_kwargs["tensorboard_output_config"] = tensorboard_output_config + if recipe.get("exp_manager") is None or recipe.get("exp_manager", dict()).get("explicit_log_dir") is None: + logger.warning("Using tensorboard but not set exp_manager -> explicit_log_dir for recipe.") + + base_job_name = args.job_name.replace(".", "-") + base_job_name = base_job_name.replace("_", "-") + estimator = PyTorch( + base_job_name=base_job_name, + instance_type=args.instance_type, + training_recipe=args.recipe, + recipe_overrides=recipe_overrides, + output_path=output_path, + role=role, + sagemaker_session=sagemaker_session, + **additional_estimator_kwargs, + ) + + if tensorboard_config: + logger.info("Tensorboard url:") + logger.info( + estimator.get_app_url( + app_type=SupportedInteractiveAppTypes.TENSORBOARD, + open_in_default_web_browser=False, + ) + ) + + if not isinstance(inputs, FileSystemInput): + keys_to_pop = 
[] + for item in inputs.keys(): + if not inputs[item]: + print(f"poping input {inputs[item]}, {item}") + keys_to_pop.append(item) + for item in keys_to_pop: + inputs.pop(item) + if len(inputs) == 0: + inputs = None + + estimator.fit(inputs=inputs, wait=sm_jobs_config.get("wait", False)) + + +if __name__ == "__main__": + main() diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_hydra.yaml b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_hydra.yaml new file mode 100644 index 0000000..777a6b0 --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_hydra.yaml @@ -0,0 +1,102 @@ +run: + name: llama-8b + results_dir: {$results_dir}/llama-8b + time_limit: 6-00:00:00 + model_type: hf +trainer: + devices: 8 + num_nodes: 16 + accelerator: gpu + precision: bf16 + max_steps: 50 + log_every_n_steps: 1 + val_check_interval: 1 + limit_val_batches: 0 +exp_manager: + exp_dir: null + name: experiment + create_tensorboard_logger: true + create_checkpoint_callback: true + checkpoint_callback_params: + save_top_k: 0 + every_n_train_steps: 10 + monitor: step + mode: max + save_last: false + checkpoint_dir: None/checkpoints/ + resume_from_checkpoint: null + auto_checkpoint: + enabled: false + export_full_model: + every_n_train_steps: 0 + save_last: true +use_smp_model: true +distributed_backend: nccl +model: + model_type: llama_v3 + train_batch_size: 1 + val_batch_size: 1 + seed: 12345 + grad_clip: 1.0 + log_reduced_training_loss: true + tensor_model_parallel_degree: 2 + expert_model_parallel_degree: 1 + context_parallel_degree: 1 + moe: false + activation_checkpointing: false + activation_loading_horizon: 1 + delayed_param: true + offload_activations: false + sharding_strategy: hybrid_shard + forward_prefetch: true + shard_degree: 64 + backward_fetch_policy: backward_pre + auto_wrap_policy: transformer_auto_wrap_policy + limit_all_gathers: true + use_orig_param: true + fp8: true + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + max_context_width: 16384 + max_position_embeddings: 16384 + num_hidden_layers: 32 + hidden_size: 4096 + num_attention_heads: 32 + intermediate_size: 14336 + initializer_range: 0.02 + layernorm_epsilon: 1.0e-05 + vocab_size: 128256 + num_key_value_heads: 8 + use_flash_attention: true + rope_theta: 500000.0 + rope_scaling: + rope_type: llama3 + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + do_finetune: false + hf_model_name_or_path: null + peft: + peft_type: null + precision: bf16 + lr_decay_iters: 50 + optim: + name: adamw + lr: 0.0001 + weight_decay: 0.01 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 0 + constant_steps: 0 + min_lr: 1.0e-06 + data: + train_dir: null + val_dir: null + dataset_type: hf + use_synthetic_data: false + viztracer: + enabled: false diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_submission.sh b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_submission.sh new file mode 100644 index 0000000..062fc12 --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/llama-8b_submission.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +pushd $(dirname -- $0) +python launch.py --job_name llama-8b --instance_type p5.48xlarge +popd \ No newline at end of file diff --git a/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/sm_jobs_config.yaml 
b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/sm_jobs_config.yaml new file mode 100644 index 0000000..677966f --- /dev/null +++ b/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/with_kwargs/llama-8b/sm_jobs_config.yaml @@ -0,0 +1,17 @@ +output_path: s3://test_path +tensorboard_config: + output_path: null + container_logs_path: null +wait: true +inputs: + s3: + train: s3://test_path + val: s3://test_path + file_system: + id: null + type: null + directory_path: null +additional_estimator_kwargs: + max_run: 1800 + enable_remote_debug: true +recipe_overrides: null diff --git a/tests/sm_jobs_workflow/test_sm_jobs_workflow.py b/tests/sm_jobs_workflow/test_sm_jobs_workflow.py new file mode 100644 index 0000000..c7d5f11 --- /dev/null +++ b/tests/sm_jobs_workflow/test_sm_jobs_workflow.py @@ -0,0 +1,124 @@ +import logging +import os + +os.environ["NEMO_LAUNCHER_DEBUG"] = "1" + +from omegaconf import OmegaConf + +from main import main + +logger = logging.getLogger(__name__) + + +from tests.test_utils import ( + compare_artifacts, + create_temp_directory, + make_hydra_cfg_instance, +) + + +def compare_sm_jobs_common_artifacts(artifacts_dir, prefix, baseline_artifacts_subdir): + logger.info("Comparing sm_jobs common artifacts") + + artifacts_paths = [ + f"/{prefix}/{prefix}_submission.sh", + f"/{prefix}/{prefix}_hydra.yaml", + f"/{prefix}/sm_jobs_config.yaml", + f"/{prefix}/launch.py", + ] + baseline_artifacts_dir = "/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/" + baseline_artifacts_subdir + compare_artifacts(artifacts_paths, artifacts_dir, baseline_artifacts_dir) + + +def is_requirements_file(artifacts_dir, prefix, baseline_artifacts_subdir, reqs): + logger.info("Checking sm_jobs requirements") + + artifacts_paths = [f"/{prefix}/requirements.txt"] + if not reqs: + reqs_file = artifacts_dir + artifacts_paths[0] + assert not (os.path.exists(reqs_file)) + else: + baseline_artifacts_dir = "/tests/sm_jobs_workflow/sm_jobs_baseline_artifacts/" + baseline_artifacts_subdir + compare_artifacts(artifacts_paths, artifacts_dir, baseline_artifacts_dir) + + +def test_sm_jobs_workflow_no_additional_kwargs(): + logger.info("Testing sm_jobs workflow without additional kwargs") + + artifacts_dir = create_temp_directory() + overrides = [ + "cluster=sm_jobs", + "cluster_type=sm_jobs", + "instance_type=p5.48xlarge", + "base_results_dir={}".format(artifacts_dir), + "+cluster.sm_jobs_config.output_path=s3://test_path", + "+cluster.sm_jobs_config.tensorboard_config.output_path=s3://test_tensorboard_path", + "+cluster.sm_jobs_config.tensorboard_config.container_logs_path=/opt/ml/output/tensorboard", + "container=test_container", + "+env_vars.NEMO_LAUNCHER_DEBUG=1", + ] + + sample_sm_jobs_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) + + logger.info("\nsample_sm_jobs_config\n") + logger.info(OmegaConf.to_yaml(sample_sm_jobs_config)) + + main(sample_sm_jobs_config) + + compare_sm_jobs_common_artifacts(artifacts_dir, "llama-8b", "no_kwargs") + is_requirements_file(artifacts_dir, "llama-8b", "no_kwargs", False) + + +def test_sm_jobs_workflow_with_additional_kwargs(): + logger.info("Testing sm_jobs workflow with additional kwargs") + + artifacts_dir = create_temp_directory() + overrides = [ + "cluster=sm_jobs", + "cluster_type=sm_jobs", + "instance_type=p5.48xlarge", + "base_results_dir={}".format(artifacts_dir), + "+cluster.sm_jobs_config.output_path=s3://test_path", + "+cluster.sm_jobs_config.inputs.s3.train=s3://test_path", + 
"+cluster.sm_jobs_config.inputs.s3.val=s3://test_path", + "container=test_container", + "+env_vars.NEMO_LAUNCHER_DEBUG=1", + ] + + sample_sm_jobs_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) + + logger.info("\nsample_sm_jobs_config\n") + logger.info(OmegaConf.to_yaml(sample_sm_jobs_config)) + + main(sample_sm_jobs_config) + + compare_sm_jobs_common_artifacts(artifacts_dir, "llama-8b", "with_kwargs") + is_requirements_file(artifacts_dir, "llama-8b", "with_kwargs", False) + + +def test_sm_jobs_workflow_multimodal(): + logger.info("Testing sm_jobs workflow for multi-modal") + + artifacts_dir = create_temp_directory() + overrides = [ + "recipes=training/llama/hf_llama3_2_11b_seq8k_gpu_p5x4_pretrain", + "cluster=sm_jobs", + "cluster_type=sm_jobs", + "instance_type=p5.48xlarge", + "base_results_dir={}".format(artifacts_dir), + "+cluster.sm_jobs_config.output_path=s3://test_path", + "+cluster.sm_jobs_config.tensorboard_config.output_path=s3://test_tensorboard_path", + "+cluster.sm_jobs_config.tensorboard_config.container_logs_path=/opt/ml/output/tensorboard", + "container=test_container", + "+env_vars.NEMO_LAUNCHER_DEBUG=1", + ] + + sample_sm_jobs_config = make_hydra_cfg_instance("../recipes_collection", "config", overrides) + + logger.info("\nsample_sm_jobs_config\n") + logger.info(OmegaConf.to_yaml(sample_sm_jobs_config)) + + main(sample_sm_jobs_config) + + compare_sm_jobs_common_artifacts(artifacts_dir, "llama3.2-11b", "multimodal") + is_requirements_file(artifacts_dir, "llama3.2-11b", "multimodal", True) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..54a213e --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,83 @@ +import difflib +import logging +import os +import shutil +import stat +import tempfile + +from hydra import compose, initialize +from hydra.core.hydra_config import HydraConfig + +logger = logging.getLogger(__name__) + + +def create_temp_directory(): + """Create a temporary directory and Set full permissions for the directory""" + temp_dir = tempfile.mkdtemp() + os.chmod(temp_dir, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO) + return temp_dir + + +def replace_placeholder(file_path, placeholder, replacement): + """Replace occurrences of placeholder in a file with the given replacement.""" + with open(file_path, "r") as file: + content = file.read() + + content = content.replace(placeholder, replacement) + + with open(file_path, "w") as file: + file.write(content) + + +def compare_artifacts(artifacts_paths, artifacts_dir, baseline_artifacts_path): + for artifact_path in artifacts_paths: + current_dir = os.getcwd() + actual_artifact_path = artifacts_dir + artifact_path + baseline_artifact_folder = current_dir + baseline_artifacts_path + + # Make a copy of baseline artifacts to replace placeholders + baseline_artifact_copy_folder = create_temp_directory() + shutil.copytree(baseline_artifact_folder, baseline_artifact_copy_folder, dirs_exist_ok=True) + baseline_artifact_path = baseline_artifact_copy_folder + artifact_path + + results_dir_placeholder = "{$results_dir}" + replace_placeholder(baseline_artifact_path, results_dir_placeholder, artifacts_dir) + workspace_dir_placeholder = "{$workspace_dir}" + replace_placeholder(baseline_artifact_path, workspace_dir_placeholder, current_dir) + + comparison_result = compare_files(baseline_artifact_path, actual_artifact_path) + if comparison_result is False: + assert False, baseline_artifact_path + " does not match " + actual_artifact_path + + +def 
compare_files(file1_path, file2_path): + """Compare two files line by line and log a unified diff if they differ.""" + with open(file1_path, "r") as file1, open(file2_path, "r") as file2: + file1_content = file1.readlines() + file2_content = file2.readlines() + + # Using difflib to compare files + diff = list(difflib.unified_diff(file1_content, file2_content, fromfile=file1_path, tofile=file2_path)) + + if diff: + diff_block = "\n" + "\n".join(line.strip() for line in diff) + logger.info(f"Files differ:{diff_block}") + return False + + logger.info("Files are identical.") + return True + + +def compose_hydra_cfg(path, config_name, overrides=[]): + """Init and compose a hydra config""" + with initialize(version_base=None, config_path=path): + return compose(config_name=config_name, overrides=overrides, return_hydra_config=True) + + +def make_hydra_cfg_instance(path, config_name, overrides): + """Init hydra instance""" + # Note: This is needed if using compose API and not hydra.main b/c we rely on hydra resolver + # Open issue tracking fix https://github.com/facebookresearch/hydra/issues/2017 + config = compose_hydra_cfg(path, config_name, overrides) + HydraConfig.instance().set_config(config) + return config diff --git a/validations_wrapper.py b/validations_wrapper.py new file mode 100644 index 0000000..2d3238e --- /dev/null +++ b/validations_wrapper.py @@ -0,0 +1,38 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. + +from functools import wraps +from typing import Any, Callable, TypeVar, cast + +from omegaconf import DictConfig + +from launcher.config_validator.type_validator import TypeValidator +from launcher.config_validator.value_validator import ValueValidator + +_T = TypeVar("_T", bound=Callable[..., Any]) + + +def validate_config(fn: _T) -> _T: + @wraps(fn) + def validations_wrapper(config: DictConfig, *args, **kwargs) -> DictConfig: + """ + Run all config validations, then invoke the wrapped function. + """ + type_validator = TypeValidator(config) + type_validator.validate() + schema_validator = ValueValidator(config) + schema_validator.validate() + + return fn(config, *args, **kwargs) + + return cast(_T, validations_wrapper)
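
A note on the `validate_config` decorator defined at the end of this diff: it is written to wrap an entry point that receives the composed `DictConfig`, running `TypeValidator` and `ValueValidator` before the wrapped function executes. The sketch below is a minimal, illustrative example of that wiring, not the repository's actual `main.py`; the `@hydra.main` arguments are assumptions inferred from the test overrides (`../recipes_collection`, config name `config`).

import hydra
from omegaconf import DictConfig, OmegaConf

from validations_wrapper import validate_config


# Assumed config location and name; the real entry point may differ.
@hydra.main(config_path="recipes_collection", config_name="config", version_base=None)
@validate_config  # TypeValidator and ValueValidator run before the body executes
def main(config: DictConfig) -> None:
    # Hypothetical body: a real entry point would dispatch to the selected
    # slurm / k8s / sm_jobs workflow based on the validated config.
    print(OmegaConf.to_yaml(config))


if __name__ == "__main__":
    main()

The workflow tests above take the other route into the same entry point: `make_hydra_cfg_instance` composes an equivalent config through Hydra's compose API and `main(config)` is called directly, so no CLI invocation is needed; whether `main.py` actually applies `validate_config` is not visible in this diff, so the decorator placement shown here is an assumption.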