vera/ln_tuning add and test case add #1294

Merged, Sep 25, 2024 (5 commits)
Changes from 2 commits
4 changes: 2 additions & 2 deletions examples/language-modeling/README.md
@@ -362,7 +362,7 @@ python run_clm.py \

## PEFT

### LORA/ADALORA/IA3/LLAMA_ADAPTER
### LORA/ADALORA/IA3/LLAMA_ADAPTER/VERA/LN_TUNING

To run LoRA finetuning, you can use `run_lora_clm.py`.
Here are single-/multi-device command examples for Llama1-7B, Falcon-40B, Llama2-70B, Llama3-8B and Llama3-70B.
@@ -691,7 +691,7 @@ DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 LOWER_LIST=ops_bf16.txt python3 ..
--validation_split_percentage 5 \
--deepspeed ds_falcon_180b_z3.json
```
Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` or `--peft_type ia3`, or enable llama-adapter for llama model using `--peft_type llama-adapter`.
Default `peft_type` is `lora`; you can enable adalora or ia3 with `--peft_type adalora` or `--peft_type ia3`, llama-adapter for Llama models with `--peft_type llama-adapter`, ln-tuning with `--peft_type ln_tuning`, or vera with `--peft_type vera`.

#### Custom Files

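For context on what the two new options do, here is a minimal, self-contained sketch (not part of this PR; it assumes `peft>=0.11`, which exports both `VeraConfig` and `LNTuningConfig`, and uses `facebook/opt-125m` with illustrative target-module names rather than anything taken from the diff):

```python
# Hedged sketch: illustrates the two PEFT methods the README now mentions.
from peft import LNTuningConfig, TaskType, VeraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# VeRA: a shared pair of frozen random projections plus small per-layer
# trainable scaling vectors on the targeted linear layers.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
vera = get_peft_model(
    model,
    VeraConfig(target_modules=["q_proj", "v_proj"], task_type=TaskType.CAUSAL_LM),
)
vera.print_trainable_parameters()

# LN tuning: only the parameters of the targeted LayerNorm modules are trained.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
ln = get_peft_model(
    model,
    LNTuningConfig(
        target_modules=["self_attn_layer_norm", "final_layer_norm"],
        task_type=TaskType.CAUSAL_LM,
    ),
)
ln.print_trainable_parameters()
```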
31 changes: 29 additions & 2 deletions examples/language-modeling/run_lora_clm.py
@@ -30,7 +30,17 @@
import torch
import transformers
from datasets import load_dataset
from peft import AdaLoraConfig, AdaptionPromptConfig, IA3Config, LoraConfig, TaskType, get_peft_model, tuners
from peft import (
AdaLoraConfig,
AdaptionPromptConfig,
IA3Config,
LNTuningConfig,
LoraConfig,
TaskType,
VeraConfig,
get_peft_model,
tuners,
)
from peft.utils.other import fsdp_auto_wrap_policy
from transformers import (
AutoConfig,
@@ -345,7 +355,7 @@ class FinetuneArguments:
default="lora",
metadata={
"help": ("The PEFT type to use."),
"choices": ["lora", "ia3", "adalora", "llama-adapter"],
"choices": ["lora", "ia3", "adalora", "llama-adapter", "vera", "ln_tuning"],
},
)
ia3_target_modules: List[str] = field(
@@ -364,6 +374,14 @@
default=10,
metadata={"help": "Number of adapter tokens to insert in llama-adapter"},
)
vera_target_modules: List[str] = field(
default_factory=lambda: None,
metadata={"help": "Target modules for the vera method."},
)
ln_target_modules: List[str] = field(
default_factory=lambda: None,
metadata={"help": "Target modules for the ln method."},
)


PROMPT_DICT = {
@@ -839,6 +857,15 @@ def compute_metrics(eval_preds):

tuners.adaption_prompt.layer.AdaptedAttention.pre_attn_forward = GaudiAdaptedAttentionPreAttnForward
tuners.adaption_prompt.layer.AdaptedAttention.__getattr__ = GaudiAdaptedAttention_getattr
elif finetune_args.peft_type == "vera":
peft_config = VeraConfig(
target_modules=finetune_args.vera_target_modules, task_type=TaskType.CAUSAL_LM, init_weights=False
)
elif finetune_args.peft_type == "ln_tuning":
peft_config = LNTuningConfig(
target_modules=finetune_args.ln_target_modules,
task_type=TaskType.CAUSAL_LM,
)
if training_args.gradient_checkpointing:
model.enable_input_require_grads()
lora_model = get_peft_model(model, peft_config)
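The new `vera_target_modules` and `ln_target_modules` fields reach the branches above through `HfArgumentParser`. A minimal, hypothetical sketch of that parsing step (trimmed from the `FinetuneArguments` dataclass in the diff; the explicit argument list is only there to make it runnable standalone):

```python
# Hedged sketch of how HfArgumentParser maps the new CLI flags onto the
# dataclass fields added above; field defaults mirror the diff.
from dataclasses import dataclass, field
from typing import List

from transformers import HfArgumentParser


@dataclass
class FinetuneArguments:
    peft_type: str = field(default="lora")
    vera_target_modules: List[str] = field(default_factory=lambda: None)
    ln_target_modules: List[str] = field(default_factory=lambda: None)


(args,) = HfArgumentParser(FinetuneArguments).parse_args_into_dataclasses(
    ["--peft_type", "vera", "--vera_target_modules", "q_proj", "v_proj"]
)
print(args.peft_type, args.vera_target_modules)  # -> vera ['q_proj', 'v_proj']
```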
142 changes: 141 additions & 1 deletion tests/baselines/llama_7b.json
@@ -349,7 +349,7 @@
"--max_train_samples 1000",
"--use_habana",
"--ppo_epochs 1",
"--batched_gen True",
"--batched_gen True",
"--mini_batch_size 1",
"--output_max_length 128",
"--input_max_length 128",
@@ -473,6 +473,146 @@
]
}
}
},
"ia3": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 3e-4,
"train_batch_size": 8,
"perplexity": 3.3,
"train_runtime": 262.8,
"train_samples_per_second": 161,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 2",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--ia3_target_modules q_proj v_proj",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type ia3"
]
}
}
},
"adalora": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 3e-4,
"train_batch_size": 8,
"perplexity": 2.59,
"train_runtime": 459,
"train_samples_per_second": 107,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 2",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--lora_alpha 16",
"--lora_dropout 0.05",
"--lora_target_modules q_proj v_proj",
"--adalora_init_r 12",
"--adalora_target_r 4",
"--adalora_tinit 50",
"--adalora_tfinal 500",
"--adalora_delta_t 100",
"--adalora_orth_reg_weight 0.5",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type adalora"
]
}
}
},
"vera": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 1e-2,
"train_batch_size": 8,
"perplexity": 8.68,
"train_runtime": 318.7,
"train_samples_per_second": 126.6,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 1",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--vera_target_modules q_proj v_proj",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type vera"
]
}
}
},
"ln_tuning": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 3e-4,
"train_batch_size": 8,
"perplexity": 2.83,
"train_runtime": 249,
"train_samples_per_second": 165,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 2",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--ln_target_module input_layernorm post_attention_layernorm norm",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type ln_tuning"
]
}
}
}
}
}
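The baseline entries above drive the new tests. A hypothetical sanity check (not part of the PR or the test harness; it assumes it is run from the repository root) that walks the file and reports which `--peft_type` each entry pins:

```python
# Hedged sketch: recursively find every "extra_arguments" list in the baseline
# file and print the path to it together with the --peft_type value it sets.
import json


def iter_extra_arguments(node, path=()):
    if isinstance(node, dict):
        if "extra_arguments" in node:
            yield path, node["extra_arguments"]
        for key, value in node.items():
            yield from iter_extra_arguments(value, path + (key,))


with open("tests/baselines/llama_7b.json") as f:
    baselines = json.load(f)

for path, extra in iter_extra_arguments(baselines):
    peft_types = [arg.split()[1] for arg in extra if arg.startswith("--peft_type")]
    print("/".join(path), peft_types or ["lora (default)"])
```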
37 changes: 36 additions & 1 deletion tests/test_examples.py
@@ -246,7 +246,14 @@ def to_test(
return False
elif ("qwen2" in model_name or "Qwen2" in model_name) and task_name == "trl-sft":
return False
elif "falcon" in model_name and task_name in ("llama-adapter", "databricks/databricks-dolly-15k"):
elif "falcon" in model_name and task_name in (
"llama-adapter",
"databricks/databricks-dolly-15k",
"vera",
"ia3",
"adalora",
"ln_tuning",
):
return False
elif model_name not in models_with_specific_rules and not deepspeed:
return True
@@ -899,3 +906,31 @@ class MultiCardCausalLanguageModelingLoRAFP8ExampleTester(
):
TASK_NAME = "tatsu-lab/alpaca_fp8"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingVeraExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "vera"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingLnExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "ln_tuning"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingIA3ExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "ia3"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingAdaloraExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "adalora"
DATASET_NAME = "tatsu-lab/alpaca"
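The four new tester classes rely on `ExampleTestMeta` to generate their actual test methods from `TASK_NAME`, `DATASET_NAME`, and the keyword arguments in the class statement. A deliberately simplified, hypothetical sketch of that pattern (the real metaclass in `tests/test_examples.py` launches the example script and compares metrics against the JSON baselines):

```python
# Hedged sketch of a metaclass-driven tester, not the real implementation.
from unittest import TestCase


class ExampleTestMeta(type):
    def __new__(mcs, name, bases, attrs, example_name=None, multi_card=False):
        # Record the class-statement keyword arguments and synthesize a test.
        attrs["EXAMPLE_NAME"] = example_name
        attrs["MULTI_CARD"] = multi_card

        def test_task_is_configured(self):
            # The real generated test runs `example_name` with the arguments
            # from the baseline JSON and asserts on the reported metrics.
            self.assertIsInstance(self.TASK_NAME, str)
            self.assertIsInstance(self.DATASET_NAME, str)

        attrs["test_task_is_configured"] = test_task_is_configured
        return super().__new__(mcs, name, bases, attrs)


class ExampleTesterBase(TestCase):
    TASK_NAME = None
    DATASET_NAME = None


class MultiCardVeraTester(
    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
    TASK_NAME = "vera"
    DATASET_NAME = "tatsu-lab/alpaca"
```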