diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md
index 57cac19713..b9eb6e1fcf 100644
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -362,7 +362,7 @@ python run_clm.py \

 ## PEFT

-### LORA/ADALORA/IA3/LLAMA_ADAPTER
+### LORA/ADALORA/IA3/LLAMA_ADAPTER/VERA/LN_TUNING

 To run LoRA finetuning, you can use `run_lora_clm.py`.
 Here are single-/multi-device command examples for Llama1-7B, Falcon-40B, Llama2-70B, Llama3-8B and Llama3-70B.
@@ -720,7 +720,7 @@ DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 LOWER_LIST=ops_bf16.txt python3 ..
     --validation_split_percentage 5 \
     --deepspeed ds_falcon_180b_z3.json
 ```
-Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` or `--peft_type ia3`, or enable llama-adapter for llama model using `--peft_type llama-adapter`.
+Default `peft_type` is `lora`. You can enable adalora or ia3 with `--peft_type adalora` or `--peft_type ia3`, llama-adapter for Llama models with `--peft_type llama-adapter`, ln-tuning with `--peft_type ln_tuning`, or vera with `--peft_type vera`.

 #### Custom Files

diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py
index 8b358ccbfe..ebbcc2e4d0 100644
--- a/examples/language-modeling/run_lora_clm.py
+++ b/examples/language-modeling/run_lora_clm.py
@@ -30,7 +30,17 @@
 import torch
 import transformers
 from datasets import load_dataset
-from peft import AdaLoraConfig, AdaptionPromptConfig, IA3Config, LoraConfig, TaskType, get_peft_model, tuners
+from peft import (
+    AdaLoraConfig,
+    AdaptionPromptConfig,
+    IA3Config,
+    LNTuningConfig,
+    LoraConfig,
+    TaskType,
+    VeraConfig,
+    get_peft_model,
+    tuners,
+)
 from peft.utils.other import fsdp_auto_wrap_policy
 from transformers import (
     AutoConfig,
@@ -349,7 +359,7 @@ class FinetuneArguments:
         default="lora",
         metadata={
             "help": ("The PEFT type to use."),
-            "choices": ["lora", "ia3", "adalora", "llama-adapter"],
+            "choices": ["lora", "ia3", "adalora", "llama-adapter", "vera", "ln_tuning"],
         },
     )
     ia3_target_modules: List[str] = field(
@@ -368,6 +378,14 @@ class FinetuneArguments:
         default=10,
         metadata={"help": "Number of adapter tokens to insert in llama-adapter"},
     )
+    vera_target_modules: List[str] = field(
+        default_factory=lambda: None,
+        metadata={"help": "Target modules for the vera method."},
+    )
+    ln_target_modules: List[str] = field(
+        default_factory=lambda: None,
+        metadata={"help": "Target modules for the ln_tuning method."},
+    )


 PROMPT_DICT = {
@@ -884,6 +902,15 @@ def compute_metrics(eval_preds):

             tuners.adaption_prompt.layer.AdaptedAttention.pre_attn_forward = GaudiAdaptedAttentionPreAttnForward
             tuners.adaption_prompt.layer.AdaptedAttention.__getattr__ = GaudiAdaptedAttention_getattr
+        elif finetune_args.peft_type == "vera":
+            peft_config = VeraConfig(
+                target_modules=finetune_args.vera_target_modules, task_type=TaskType.CAUSAL_LM, init_weights=False
+            )
+        elif finetune_args.peft_type == "ln_tuning":
+            peft_config = LNTuningConfig(
+                target_modules=finetune_args.ln_target_modules,
+                task_type=TaskType.CAUSAL_LM,
+            )
         if training_args.gradient_checkpointing:
             model.enable_input_require_grads()
         lora_model = get_peft_model(model, peft_config)
diff --git a/tests/baselines/llama_7b.json b/tests/baselines/llama_7b.json
index 1c303c9d9c..5d3ef45d24 100644
--- a/tests/baselines/llama_7b.json
+++ b/tests/baselines/llama_7b.json
@@ -349,7 +349,7 @@
                         "--max_train_samples 1000",
                         "--use_habana",
                         "--ppo_epochs 1",
-                        "--batched_gen True", 
+                        "--batched_gen True",
                         "--mini_batch_size 1",
                         "--output_max_length 128",
                         "--input_max_length 128",
@@ -473,6 +473,146 @@
                     ]
                 }
             }
+        },
+        "ia3": {
+            "num_train_epochs": 3,
+            "eval_batch_size": 4,
+            "distribution": {
+                "multi_card": {
+                    "learning_rate": 3e-4,
+                    "train_batch_size": 8,
+                    "perplexity": 3.3,
+                    "train_runtime": 262.8,
+                    "train_samples_per_second": 161,
+                    "extra_arguments": [
+                        "--bf16",
+                        "--gradient_accumulation_steps 2",
+                        "--eval_strategy no",
+                        "--save_strategy no",
+                        "--warmup_ratio 0.03",
+                        "--lr_scheduler_type constant",
+                        "--max_grad_norm 0.3",
+                        "--logging_steps 1",
+                        "--use_hpu_graphs_for_inference",
+                        "--ia3_target_modules q_proj v_proj",
+                        "--dataset_concatenation",
+                        "--max_seq_length 512",
+                        "--low_cpu_mem_usage True",
+                        "--adam_epsilon 1e-08",
+                        "--ddp_bucket_cap_mb 50",
+                        "--validation_split_percentage 10",
+                        "--attn_softmax_bf16",
+                        "--peft_type ia3"
+                    ]
+                }
+            }
+        },
+        "adalora": {
+            "num_train_epochs": 3,
+            "eval_batch_size": 4,
+            "distribution": {
+                "multi_card": {
+                    "learning_rate": 3e-4,
+                    "train_batch_size": 8,
+                    "perplexity": 2.59,
+                    "train_runtime": 459,
+                    "train_samples_per_second": 107,
+                    "extra_arguments": [
+                        "--bf16",
+                        "--gradient_accumulation_steps 2",
+                        "--eval_strategy no",
+                        "--save_strategy no",
+                        "--warmup_ratio 0.03",
+                        "--lr_scheduler_type constant",
+                        "--max_grad_norm 0.3",
+                        "--logging_steps 1",
+                        "--use_hpu_graphs_for_inference",
+                        "--lora_alpha 16",
+                        "--lora_dropout 0.05",
+                        "--lora_target_modules q_proj v_proj",
+                        "--adalora_init_r 12",
+                        "--adalora_target_r 4",
+                        "--adalora_tinit 50",
+                        "--adalora_tfinal 500",
+                        "--adalora_delta_t 100",
+                        "--adalora_orth_reg_weight 0.5",
+                        "--dataset_concatenation",
+                        "--max_seq_length 512",
+                        "--low_cpu_mem_usage True",
+                        "--adam_epsilon 1e-08",
+                        "--ddp_bucket_cap_mb 50",
+                        "--validation_split_percentage 10",
+                        "--attn_softmax_bf16",
+                        "--peft_type adalora"
+                    ]
+                }
+            }
+        },
+        "vera": {
+            "num_train_epochs": 3,
+            "eval_batch_size": 4,
+            "distribution": {
+                "multi_card": {
+                    "learning_rate": 1e-2,
+                    "train_batch_size": 8,
+                    "perplexity": 8.68,
+                    "train_runtime": 318.7,
+                    "train_samples_per_second": 126.6,
+                    "extra_arguments": [
+                        "--bf16",
+                        "--gradient_accumulation_steps 1",
+                        "--eval_strategy no",
+                        "--save_strategy no",
+                        "--warmup_ratio 0.03",
+                        "--lr_scheduler_type constant",
+                        "--max_grad_norm 0.3",
+                        "--logging_steps 1",
+                        "--use_hpu_graphs_for_inference",
+                        "--vera_target_modules q_proj v_proj",
+                        "--dataset_concatenation",
+                        "--max_seq_length 512",
+                        "--low_cpu_mem_usage True",
+                        "--adam_epsilon 1e-08",
+                        "--ddp_bucket_cap_mb 50",
+                        "--validation_split_percentage 10",
+                        "--attn_softmax_bf16",
+                        "--peft_type vera"
+                    ]
+                }
+            }
+        },
+        "ln_tuning": {
+            "num_train_epochs": 3,
+            "eval_batch_size": 4,
+            "distribution": {
+                "multi_card": {
+                    "learning_rate": 3e-4,
+                    "train_batch_size": 8,
+                    "perplexity": 2.83,
+                    "train_runtime": 249,
+                    "train_samples_per_second": 165,
+                    "extra_arguments": [
+                        "--bf16",
+                        "--gradient_accumulation_steps 2",
+                        "--eval_strategy no",
+                        "--save_strategy no",
+                        "--warmup_ratio 0.03",
+                        "--lr_scheduler_type constant",
+                        "--max_grad_norm 0.3",
+                        "--logging_steps 1",
+                        "--use_hpu_graphs_for_inference",
+                        "--ln_target_modules input_layernorm post_attention_layernorm norm",
+                        "--dataset_concatenation",
+                        "--max_seq_length 512",
+                        "--low_cpu_mem_usage True",
+                        "--adam_epsilon 1e-08",
+                        "--ddp_bucket_cap_mb 50",
+                        "--validation_split_percentage 10",
+                        "--attn_softmax_bf16",
+                        "--peft_type ln_tuning"
+                    ]
+                }
+            }
         }
     }
 }
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 3670b32693..c126cace3d 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -240,13 +240,21 @@ def to_test(
             or "prompt_tuning" in example_name
             or "peft_poly" in example_name
             or example_name == "run_sequence_classification"
+            or task_name in ("llama-adapter", "vera", "ia3", "adalora", "ln_tuning", "mamamiya405/finred")
         ) and not IS_GAUDI2:
             return False
         elif "llama" in model_name and "trl-sft-chat" in task_name:
             return False
         elif ("qwen2" in model_name or "Qwen2" in model_name) and task_name == "trl-sft":
             return False
-        elif "falcon" in model_name and task_name in ("llama-adapter", "databricks/databricks-dolly-15k"):
+        elif "falcon" in model_name and task_name in (
+            "llama-adapter",
+            "databricks/databricks-dolly-15k",
+            "vera",
+            "ia3",
+            "adalora",
+            "ln_tuning",
+        ):
             return False
         elif model_name not in models_with_specific_rules and not deepspeed:
             return True
@@ -899,3 +907,31 @@ class MultiCardCausalLanguageModelingLoRAFP8ExampleTester(
 ):
     TASK_NAME = "tatsu-lab/alpaca_fp8"
     DATASET_NAME = "tatsu-lab/alpaca"
+
+
+class MultiCardCausalLanguageModelingVeraExampleTester(
+    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
+):
+    TASK_NAME = "vera"
+    DATASET_NAME = "tatsu-lab/alpaca"
+
+
+class MultiCardCausalLanguageModelingLnExampleTester(
+    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
+):
+    TASK_NAME = "ln_tuning"
+    DATASET_NAME = "tatsu-lab/alpaca"
+
+
+class MultiCardCausalLanguageModelingIA3ExampleTester(
+    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
+):
+    TASK_NAME = "ia3"
+    DATASET_NAME = "tatsu-lab/alpaca"
+
+
+class MultiCardCausalLanguageModelingAdaloraExampleTester(
+    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
+):
+    TASK_NAME = "adalora"
+    DATASET_NAME = "tatsu-lab/alpaca"
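
For reviewers who want to see the two new `peft_type` values in isolation, the snippet below is a minimal sketch (not part of the patch) of what `run_lora_clm.py` does when `--peft_type vera` or `--peft_type ln_tuning` is selected. It assumes a `peft` release recent enough to export `VeraConfig` and `LNTuningConfig` (as imported in the diff above); the `tiny_llama` helper and its config sizes are made-up illustration values so the example runs without downloading a checkpoint.

```python
# Minimal sketch, not part of the diff: wrap a causal LM with VeRA and with LN-Tuning
# the same way run_lora_clm.py does for --peft_type vera / --peft_type ln_tuning.
from peft import LNTuningConfig, TaskType, VeraConfig, get_peft_model
from transformers import LlamaConfig, LlamaForCausalLM


def tiny_llama():
    # Tiny randomly initialized stand-in for the Llama-7B models used in the examples;
    # all sizes below are arbitrary illustration values.
    return LlamaForCausalLM(
        LlamaConfig(
            vocab_size=1000,
            hidden_size=64,
            intermediate_size=128,
            num_hidden_layers=2,
            num_attention_heads=4,
            num_key_value_heads=4,  # no GQA, so q_proj and v_proj have identical shapes
        )
    )


# VeRA: frozen shared random projections with small trainable scaling vectors.
# target_modules and init_weights=False mirror the VeraConfig call in run_lora_clm.py
# and the "--vera_target_modules q_proj v_proj" baseline added in this patch.
vera_model = get_peft_model(
    tiny_llama(),
    VeraConfig(target_modules=["q_proj", "v_proj"], task_type=TaskType.CAUSAL_LM, init_weights=False),
)
vera_model.print_trainable_parameters()

# LN-Tuning: everything stays frozen except the targeted normalization layers.
# The module names mirror the ln_tuning baseline's
# "input_layernorm post_attention_layernorm norm" target list.
ln_model = get_peft_model(
    tiny_llama(),
    LNTuningConfig(
        target_modules=["input_layernorm", "post_attention_layernorm", "norm"],
        task_type=TaskType.CAUSAL_LM,
    ),
)
ln_model.print_trainable_parameters()
```

Both methods train far fewer parameters than full finetuning: VeRA learns only small per-layer scaling vectors on top of frozen, randomly initialized projections, while LN-Tuning updates only the normalization weights, which is why the new baseline entries carry their own perplexity and runtime targets rather than reusing the LoRA numbers.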