# GH Task Runner (Large Suite 1) — run #4
# NOTE(review): the original paste carried GitHub web-UI boilerplate here
# (the "bidirectional Unicode" viewer banner); it is not workflow content.
name: GH Task Runner (Large Suite 1)

# Manually-dispatched workflow: fans one lm-eval task pattern per matrix job
# across self-hosted GPU runners.
on:
  workflow_dispatch:
    inputs:
      approval_notice:
        description: 'WARNING: This will spin up a large number of tasks - get approval from admin before running'
        required: false
        default: 'NOT_APPROVED'
        type: choice
        options:
          - NOT_APPROVED
          - APPROVED
      model_hf_repo:
        description: 'Model Hugging Face Repository'
        required: true
        default: 'RWKV/rwkv-5-world-1b5'
      model_args:
        description: 'Model Arguments'
        required: false
        default: 'dtype="float32",trust_remote_code=True'
      batch_size:
        description: 'Batch Size'
        required: true
        default: 'auto'
      backend:
        description: 'Backend to use'
        required: true
        default: 'nvidia-gpu'
        type: choice
        options:
          - nvidia-gpu
          - intel-gpu
          - amd-gpu
          - any-gpu
      gpu_vram:
        description: 'Minimum GPU VRAM (ignored for MPS)'
        required: true
        default: '24'
        # Quoted: workflow_dispatch choice values are strings; bare ints here
        # would not match the string default '24'.
        type: choice
        options:
          - '16'
          - '24'
          - '40'
          - '48'
          - '80'
      num_fewshot:
        description: 'num_fewshot setting (ignored if < 0)'
        required: true
        # Quoted: all workflow_dispatch inputs are strings; a bare -1 invites
        # implicit-typing surprises in generic YAML tooling.
        default: '-1'
env:
  # Get the final task.
  # NOTE(review): neither `custom_task` nor `run_task` is a declared
  # workflow_dispatch input in this file, so this resolves to empty; the job
  # below actually uses `matrix.run_task`. Confirm before relying on RUN_TASK.
  RUN_TASK: ${{ github.event.inputs.custom_task || github.event.inputs.run_task }}
  # HF repo to sync eval output to
  HF_REPO_SYNC: rwkv-x-dev/lm-eval-output
  # Model HF repo
  MODEL_HF_REPO: ${{ github.event.inputs.model_hf_repo }}
  # Secrets
  HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
jobs:
  gh-task-runner-large-suite-1:
    # Hard gate: only run when the operator explicitly chose APPROVED.
    if: ${{ github.event.inputs.approval_notice == 'APPROVED' }}
    # Strategy Matrix
    strategy:
      # Disable fail-fast so one failing task does not cancel the rest.
      fail-fast: false
      matrix:
        # NOTE: There is a matrix limit of 256 on github.
        # Entries containing `*` are quoted: they are lm-eval glob patterns,
        # and quoting keeps YAML tooling (and reviewers) from misreading them.
        run_task:
          - anli
          - arc_easy
          - arc_challenge
          - ai2_arc
          - 'anagrams*'
          - 'anli_*'
          - advanced_ai_risk
          - 'advanced_ai_risk_fewshot-*'
          - 'advanced_ai_risk_human-*'
          - 'advanced_ai_risk_lm-*'
          - arithmetic
          - 'arithmetic_*'
          - asdiv
          - babi
          - bbh
          - bbh_cot_fewshot
          - 'bbh_cot_fewshot_*'
          - bbh_cot_zeroshot
          - 'bbh_cot_zeroshot_*'
          - bbh_fewshot
          - 'bbh_fewshot_*'
          - bbh_zeroshot
          - 'bbh_zeroshot_*'
          - belebele
          - 'belebele_*'
          - 'bigbench_*'
          - blimp
          - 'blimp_*'
          - boolq
          - boolq-seq2seq
          - cb
          - ceval-valid
          - 'ceval-valid_*'
          - chain_of_thought
          - cmmlu
          - 'cmmlu_*'
          - 'code2text_*'
          - codexglue_code2text
          - cola
          - copa
          - coqa
          - crows_pairs
          - 'crows_pairs_*'
          - csatqa
          - 'csatqa_*'
          - cycle_letters
          - drop
          - 'ethics_*'
          - 'flan_held_*'
          - fld
          - 'fld_*'
          - freebase
          - generate_until
          - glue
          - gpt3_translation_benchmarks
          - gsm8k
          - gsm8k_cot
          - gsm8k_cot_self_consistency
          - headqa
          - headqa_en
          - headqa_es
          - hellaswag
          - 'hellaswag_*'
          - hendrycks_ethics
          - ifeval
          - iwslt2017
          - 'iwslt2017-*'
          - kmmlu
          - 'kmmlu_*'
          - kobest
          - 'kobest_*'
          - lambada
          - 'lambada_*'
          - logieval
          - logiqa
          - logiqa2
          - loglikelihood
          - math_word_problems
          - mathqa
          - mc_taco
          - medmcqa
          - medqa_4options
          - 'mgsm_*'
          - minerva_math
          - 'minerva_math_*'
          - mmlu
          - 'mmlu_*'
          - mnli
          - mnli_mismatch
          - mrpc
          - multimedqa
          - multiple_choice
          - multirc
          - mutual
          - mutual_plus
          - nq_open
          - openbookqa
          - 'paws_*'
          - pawsx
          - persona
          - 'persona_*'
          - pile
          - 'pile_*'
          - piqa
          - polemo2
          - 'polemo2_*'
          - prost
          - pubmedqa
          - pythia
          - qa4mre
          - 'qa4mre_*'
          - qasper
          - 'qasper_*'
          - qnli
          - qqp
          - race
          - random_insertion
          - realtoxicityprompts
          - record
          - reversed_words
          - rte
          - sciq
          - scrolls
          - self_consistency
          - sglue_rte
          - social_bias
          - social_iqa
          - squadv2
          - sst2
          - storycloze
          - 'storycloze_*'
          - 'super-glue-*'
          - swag
          - sycophancy
          - 'sycophancy_on_*'
          - t0_eval
          - toxigen
          - translation
          - triviaqa
          - truthfulqa
          - 'truthfulqa_*'
          - unscramble
          - webqs
          - wic
          - wikitext
          - winogrande
          - wmt-ro-en-t5-prompt
          - wmt-t5-prompt
          - wmt14
          - 'wmt14-*'
          - wmt16
          - 'wmt16-*'
          - wnli
          - wsc
          - wsc273
          - xcopa
          - 'xcopa_*'
          - xnli
          - 'xnli_*'
          - xstorycloze
          - 'xstorycloze_*'
          - xwinograd
          - 'xwinograd_*'
    # Name of the job
    name: "[${{ matrix.run_task }}] ${{ github.event.inputs.model_hf_repo }} - ${{ github.event.inputs.model_args }}"
    # Due to github worker hard limitation, of 24 hours
    # we apply a timeout of 23 hours instead.
    timeout-minutes: 1380
    # Route to a self-hosted runner matching the backend + VRAM labels.
    runs-on:
      - ${{ github.event.inputs.backend }}
      - gpu-vram-${{ github.event.inputs.gpu_vram }}
    # Actual task setup, and run steps
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies / setup project
        run: |
          # Basic dependencies install, and output setup
          mkdir -p ./output
          python -m pip install .
          python -m pip install -e .
          # Setup HF cache
          chmod +x ./gh-task-runner/*.sh
          ./gh-task-runner/hf-cache-setup.sh
      - name: Run Task
        run: |
          # Run it
          echo "# ------------------------------"
          echo "# Running Task ...."
          echo "# ------------------------------"
          # Get the final task to run.
          # Quoted: many task patterns contain `*` and must not be
          # glob-expanded by the shell if a matching file happens to exist.
          task_to_run="${{ matrix.run_task }}"
          # Check if the few shot setting is larger or equal to 0.
          # Quoted so an empty input fails this test cleanly instead of
          # producing a bash syntax error.
          if [ "${{ github.event.inputs.num_fewshot }}" -ge 0 ]; then
            # Fail on pipe error
            set -o pipefail
            # Run it.
            # NOTE(review): --device mps looks copied from an MPS runner
            # workflow while `backend` selects GPU runners — confirm that
            # accelerate overrides the device here.
            accelerate launch -m lm_eval --model hf \
              --model_args pretrained=${{ github.event.inputs.model_hf_repo }},${{ github.event.inputs.model_args }} \
              --tasks "$task_to_run" \
              --batch_size "${{ github.event.inputs.batch_size }}" \
              --device mps \
              --num_fewshot "${{ github.event.inputs.num_fewshot }}" \
              --log_samples --output_path ./output 2>&1 | tee -a ./output/taskrun.log
          else
            # Fail on pipe error
            set -o pipefail
            # Run it (no --num_fewshot: negative setting means "use task default")
            accelerate launch -m lm_eval --model hf \
              --model_args pretrained=${{ github.event.inputs.model_hf_repo }},${{ github.event.inputs.model_args }} \
              --tasks "$task_to_run" \
              --batch_size "${{ github.event.inputs.batch_size }}" \
              --device mps \
              --log_samples --output_path ./output 2>&1 | tee -a ./output/taskrun.log
          fi
      - name: Upload outputs to HF
        if: always()
        run: |
          # Replace `*` in the task pattern so it is a safe path segment
          CLEANED_TASK=$(echo "${{ matrix.run_task }}" | sed 's/\*/_/g')
          HF_SUBDIR_PATH="${{ env.MODEL_HF_REPO }}/$CLEANED_TASK/${{ github.event.inputs.model_args }}-num_fewshot=${{ github.event.inputs.num_fewshot }}/${{ github.event.inputs.backend }}/"
          ./gh-task-runner/hf-upload-runner.sh "${{ env.HF_REPO_SYNC }}" "$HF_SUBDIR_PATH" "./output"
      # Note that this is meant to be a contingency measure, in case the HF upload failed
      - name: Save output Files
        uses: actions/upload-artifact@v3
        if: always()
        with:
          name: output-files
          path: |
            output/*
          retention-days: 365